VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 97236

最後變更 在這個檔案從97236是 97236,由 vboxsync 提交於 2 年 前

VMM/IEM: Rough implementation for fptan instruction in IEM, bugref:9898

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 643.9 KB
 
1/* $Id: IEMAllAImplC.cpp 97236 2022-10-19 10:44:15Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2022 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
48/** @def IEM_WITHOUT_ASSEMBLY
49 * Enables all the code in this file.
 *
 * When the build has not defined it already, it is defined here for
 * RT_ARCH_ARM32, RT_ARCH_ARM64 and DOXYGEN_RUNNING builds.  Further down it
 * is tested together with RT_ARCH_AMD64 / RT_ARCH_X86 to decide which of the
 * C implementations get compiled.
50 */
51#if !defined(IEM_WITHOUT_ASSEMBLY)
52# if defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING)
53# define IEM_WITHOUT_ASSEMBLY
54# endif
55#endif
56/* IEM_WITH_ASSEMBLY trumps IEM_WITHOUT_ASSEMBLY for tstIEMAImplAsm purposes.
   This must stay after the block above so that the #undef wins. */
57#ifdef IEM_WITH_ASSEMBLY
58# undef IEM_WITHOUT_ASSEMBLY
59#endif
60
/**
 * Calculates the sign flag value given a result and its bit width.
 *
 * The sign flag (SF) duplicates the most significant bit of the result, so
 * that bit is isolated and moved to the SF bit position.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( (uint32_t)(((a_uResult) >> ((a_cBitsWidth) - 1)) & 1) << X86_EFL_SF_BIT )
73
/**
 * Calculates the zero flag value given a result.
 *
 * The zero flag (ZF) is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult       Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (a_uResult) != 0 ? (uint32_t)0 : (uint32_t)1 << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * These are typically selected by concatenating the operand bit count onto the
 * X86_EFL_GET_OF_ prefix.  The 8-bit variant is the odd one out: its sign bit
 * has to be shifted left to reach the OF position, whereas the wider variants
 * shift right.
 */
#define X86_EFL_GET_OF_WIDE(a_uValue, a_cBits) \
    ( (uint32_t)((a_uValue) >> ((a_cBits) - (X86_EFL_OF_BIT + 1))) & X86_EFL_OF )
#define X86_EFL_GET_OF_8(a_uValue)  ( ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) & X86_EFL_OF )
#define X86_EFL_GET_OF_16(a_uValue) X86_EFL_GET_OF_WIDE(a_uValue, 16)
#define X86_EFL_GET_OF_32(a_uValue) X86_EFL_GET_OF_WIDE(a_uValue, 32)
#define X86_EFL_GET_OF_64(a_uValue) X86_EFL_GET_OF_WIDE(a_uValue, 64)
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after an arithmetic op.
 *
 * This is a statement macro: it rewrites *a_pfEFlags in place and yields no
 * value.  a_pfEFlags, a_uResult and a_uDst are expanded more than once, so
 * they must be free of side effects.  a_cBitsWidth is token-pasted and must
 * therefore be a literal 8, 16, 32 or 64.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition (2 - 1 == 2 + -1), it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit must differ from the first input.  Callers express
 * this by passing the source with its sign bit flipped as a_uSrcOf.
 * Note! Must xor with the sign bit to convert, not do (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflTmp |= (a_CfExpr) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= (uint32_t)((a_uResult) ^ (a_uSrc) ^ (a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(  (uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf)) \
                                                   & ((a_uResult) ^ (a_uDst)) \
                                                   & RT_BIT_64(a_cBitsWidth - 1) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * This is a statement macro: it rewrites *a_pfEFlags in place and yields no
 * value.  All status bits are cleared first; PF, ZF and SF are then derived
 * from the result and a_fExtra is OR'ed in last.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags) & ~X86_EFL_STATUS_BITS; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflTmp |= (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
497#if 0
498/** 128-bit floating point constant: 2.0
 * Compiled out by the surrounding "#if 0", so nothing in this file can
 * reference it as it stands. */
499const RTFLOAT128U g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
500#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1
 *
 * NOTE: The digits above describe the full-precision initializer that is
 *       commented out below.  The active initializer keeps only the top 20
 *       bits of the low 64-bit mantissa word (0xf3579...) and zeroes the
 *       remaining 44 bits.
 * NOTE(review): the reason for the truncation is not stated here; confirm
 *       before switching back to the full-precision value. */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value (128-bit mantissa only, no sign or exponent).
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
 * Same high 64-bit word as g_u128Ln2Mantissa; only the top two bits of the
 * low word are kept.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1.
 * Entry aN approximates 1/(N+1)! as a 128-bit floating point value (compare
 * the base-10 figures), listed from a0 upwards. */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0 = 1/1!
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1 = 1/2!
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2 = 1/3!
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3 = 1/4!
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4 = 1/5!
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5 = 1/6!
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6 = 1/7!
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7 = 1/8!
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8 = 1/9!
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9 = 1/10!
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10 = 1/11!
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11 = 1/12!
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12 = 1/13!
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13 = 1/14!
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14 = 1/15!
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15 = 1/16!
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16 = 1/17!
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17 = 1/18!
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18 = 1/19!
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19 = 1/20!
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20 = 1/21!
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21 = 1/22!
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 64-bit: in this build configuration it simply forwards to iemAImpl_andn_u64_fallback. */
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
/** ANDN, 32-bit: in this build configuration it simply forwards to iemAImpl_andn_u32_fallback. */
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation, generic over the operand width.
 *
 * Expects puDst, uSrc and pfEFlags to be in scope where the macro is expanded.
 * The plain iemAImpl_<mnemonic>_u<width> worker runs on local copies of the
 * destination value and of EFLAGS; the result is then published with a
 * compare-exchange against the value the computation started from.  When the
 * exchange fails the computation is repeated (uOld is passed by address to the
 * compare-exchange, presumably so it receives the current destination value).
 * EFLAGS is only written back after a successful exchange.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t                   fEflTmp; \
        for (;;) \
        { \
            uTmp    = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, uSrc, &fEflTmp); \
            if (ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)) \
                break; \
        } \
        *pfEFlags = fEflTmp; \
    } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
/* LOCK prefixed BTC, BTR and BTS variants, generated with EMIT_LOCKED_BIN_OP
   from the plain workers.  BT has no locked variant emitted here. */
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
1396/*
1397 * Helpers for BSR and BSF.
1398 *
1399 * Note! "undefined" flags: OF, SF, AF, PF, CF.
1400 * Intel behavior modelled on 10980xe, AMD on 3990X. Other marchs may
1401 * produce different result (see https://www.sandpile.org/x86/flags.htm),
1402 * but we restrict ourselves to emulating these recent marchs.
1403 */
/**
 * Stores a BSF/BSR result the Intel way.
 *
 * a_iBit is the one-based index of the bit found, or zero when no bit was set.
 * OF, SF, ZF, AF, PF and CF are cleared first.  When a bit was found, its
 * zero-based index is stored in the destination and g_afParity[index] is ORed
 * into the flags.  Otherwise ZF and PF are set and the destination is left
 * unmodified.
 *
 * The pointer arguments are now really taken from the macro parameters (the
 * flags parameter used to be ignored in favour of whatever 'pfEFlags' was in
 * scope at the expansion site) and are parenthesized.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result the AMD way.
 *
 * a_iBit is the one-based index of the bit found, or zero when no bit was set.
 * Only ZF is touched: cleared when a bit was found (and its zero-based index
 * stored in the destination), set otherwise (destination left unmodified).
 *
 * The pointer arguments are now really taken from the macro parameters (the
 * flags parameter used to be ignored in favour of whatever 'pfEFlags' was in
 * scope at the expansion site) and are parenthesized.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT result the Intel way.
 *
 * The count is always written to the destination.  OF, SF, ZF, AF, PF and CF
 * are cleared first.  A non-zero count then contributes g_afParity[count]; a
 * zero count sets ZF and PF.  CF is set when the source operand is zero.
 *
 * The source argument is parenthesized so that expressions can be passed
 * safely.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result the AMD way.
 *
 * The count is always written to the destination.  Only ZF and CF are
 * recalculated: ZF is set when the count is zero, CF when the source operand
 * is zero.  All other flags are preserved.
 *
 * The source argument is parenthesized so that expressions can be passed
 * safely.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
/** LZCNT, 64-bit, default variant: forwards to the Intel flavour. */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
/** LZCNT, 32-bit, default variant: forwards to the Intel flavour. */
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
/** LZCNT, 16-bit, default variant: forwards to the Intel flavour. */
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
1617/*
1618 * TZCNT - count trailing zero bits.
1619 */
/** TZCNT, 64-bit, default variant: forwards to the Intel flavour. */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
/** TZCNT, 32-bit, default variant: forwards to the Intel flavour. */
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
/** TZCNT, 16-bit, default variant: forwards to the Intel flavour. */
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
/** Lookup table for 6-bit values: entry i holds the number of set bits in i (i = 0..63). */
1829RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
1830{
1831 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1832 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1833 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1834 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1835};
1836
1837/** @todo Use native popcount where possible and employ some more efficient
1838 * algorithm here (or in asm.h fallback)! */
1839
1840DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
1841{
1842 return g_abBitCounts6[ u16 & 0x3f]
1843 + g_abBitCounts6[(u16 >> 6) & 0x3f]
1844 + g_abBitCounts6[(u16 >> 12) & 0x3f];
1845}
1846
1847DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
1848{
1849 return g_abBitCounts6[ u32 & 0x3f]
1850 + g_abBitCounts6[(u32 >> 6) & 0x3f]
1851 + g_abBitCounts6[(u32 >> 12) & 0x3f]
1852 + g_abBitCounts6[(u32 >> 18) & 0x3f]
1853 + g_abBitCounts6[(u32 >> 24) & 0x3f]
1854 + g_abBitCounts6[(u32 >> 30) & 0x3f];
1855}
1856
1857DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
1858{
1859 return g_abBitCounts6[ u64 & 0x3f]
1860 + g_abBitCounts6[(u64 >> 6) & 0x3f]
1861 + g_abBitCounts6[(u64 >> 12) & 0x3f]
1862 + g_abBitCounts6[(u64 >> 18) & 0x3f]
1863 + g_abBitCounts6[(u64 >> 24) & 0x3f]
1864 + g_abBitCounts6[(u64 >> 30) & 0x3f]
1865 + g_abBitCounts6[(u64 >> 36) & 0x3f]
1866 + g_abBitCounts6[(u64 >> 42) & 0x3f]
1867 + g_abBitCounts6[(u64 >> 48) & 0x3f]
1868 + g_abBitCounts6[(u64 >> 54) & 0x3f]
1869 + g_abBitCounts6[(u64 >> 60) & 0x3f];
1870}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
1942IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
1943{
1944 uint64_t const uOld = *puMem;
1945 *puMem = *puReg;
1946 *puReg = uOld;
1947}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
1951IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
1952{
1953 uint32_t const uOld = *puMem;
1954 *puMem = *puReg;
1955 *puReg = uOld;
1956}
1957
1958
1959IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
1960{
1961 uint16_t const uOld = *puMem;
1962 *puMem = *puReg;
1963 *puReg = uOld;
1964}
1965
1966
1967IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
1968{
1969 uint8_t const uOld = *puMem;
1970 *puMem = *puReg;
1971 *puReg = uOld;
1972}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
2079# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B: stores *pu128RbxRcx in *pu128Dst if *pu128Dst equals
 * *pu128RaxRdx.  ZF is set on a successful exchange and cleared otherwise;
 * *pu128RaxRdx is handed to the atomic helper as its old-value output.
 *
 * Only compiled for AMD64 and ARM64 hosts.
 */
2080IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2081 uint32_t *pEFlags))
2082{
    /* The original comparand is only kept in strict builds, for the assertion below. */
2083# ifdef VBOX_STRICT
2084 RTUINT128U const uOld = *pu128RaxRdx;
2085# endif
    /* AMD64 uses the helper variant taking the values as separate high/low halves. */
2086# if defined(RT_ARCH_AMD64)
2087 if (ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo, pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo,
2088 &pu128RaxRdx->u))
2089# else
2090 if (ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u))
2091# endif
2092 {
2093 Assert(pu128RaxRdx->s.Lo == uOld.s.Lo && pu128RaxRdx->s.Hi == uOld.s.Hi);
2094 *pEFlags |= X86_EFL_ZF;
2095 }
2096 else
2097 *pEFlags &= ~X86_EFL_ZF;
2098}
2099# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
/** Unlocked 8-bit CMPXCHG worker; forwards all arguments to the locked variant. */
2126IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2127{
2128 iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
2129}
2130
2131
/** Unlocked 16-bit CMPXCHG worker; forwards all arguments to the locked variant. */
2132IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2133{
2134 iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
2135}
2136
2137
/** Unlocked 32-bit CMPXCHG worker; forwards all arguments to the locked variant. */
2138IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2139{
2140 iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
2141}
2142
2143
/* Unlocked 64-bit CMPXCHG worker; forwards all arguments to the locked variant.
   On 32-bit hosts the source value is passed by pointer, otherwise by value. */
2144# if ARCH_BITS == 32
2145IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2146{
2147 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
2148}
2149# else
2150IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2151{
2152 iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
2153}
2154# endif
2155
2156
/** Unlocked CMPXCHG8B worker; forwards all arguments to the locked variant. */
2157IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
2158{
2159 iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
2160}
2161
2162
/**
 * Unlocked CMPXCHG16B worker; forwards all arguments to the locked variant.
 *
 * NOTE(review): in this file iemAImpl_cmpxchg16b_locked is only defined for
 * RT_ARCH_AMD64 / RT_ARCH_ARM64, while this wrapper is compiled whenever
 * IEM_WITHOUT_ASSEMBLY is defined - confirm that other hosts still link.
 */
2163IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
2164 uint32_t *pEFlags))
2165{
2166 iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
2167}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
2181 * - The U8 versions return their output in AL + AH instead of xDX + xAX, with
2182 * IDIV/DIV taking all of their input in AX too. This means we have to abstract
2183 * some input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/* Helper macros plugged into the EMIT_MUL/IMUL/DIV/IDIV templates.  They rely
   on the emitted worker having parameters named puA/puD (or puAX for the 8-bit
   variants). */

/* Load the double-width dividend: low half from *puA, high half from *puD. */
2198# define DIV_LOAD(a_Dividend) \
2199 a_Dividend.s.Lo = *puA, a_Dividend.s.Hi = *puD
/* 8-bit variant: the whole 16-bit dividend comes from *puAX. */
2200# define DIV_LOAD_U8(a_Dividend) \
2201 a_Dividend.u = *puAX
2202
/* Store quotient in *puA and remainder in *puD; the 8-bit variant packs the
   quotient into the low byte and the remainder into the high byte of *puAX. */
2203# define DIV_STORE(a_Quotient, a_uReminder) *puA = (a_Quotient), *puD = (a_uReminder)
2204# define DIV_STORE_U8(a_Quotient, a_uReminder) *puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uReminder) << 8)
2205
/* Load the implicit first factor: *puA, or the low byte of *puAX for 8-bit. */
2206# define MUL_LOAD_F1() *puA
2207# define MUL_LOAD_F1_U8() ((uint8_t)*puAX)
2208
/* Store the double-width product: low half to *puA and high half to *puD, or
   the whole 16-bit value to *puAX for 8-bit. */
2209# define MUL_STORE(a_Result) *puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi
2210# define MUL_STORE_U8(a_Result) *puAX = a_Result.u
2211
/* Two's complement negation of a double-width value (native integer / 128-bit). */
2212# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
2213 (a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u
2214# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
2215 RTUInt128AssignNeg(&(a_Value))
2216
/* Unsigned widening multiplication (native integer / 128-bit).
   Note: the 128-bit variant expands with a trailing semicolon of its own. */
2217# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2218 (a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2)
2219# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
2220 RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2);
2221
/* Unsigned division producing both quotient and remainder (native integer / 128-bit). */
2222# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2223 a_Quotient.u = (a_Dividend).u / (a_uDivisor), \
2224 a_Remainder.u = (a_Dividend).u % (a_uDivisor)
2225# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
2226 RTUInt128DivRemByU64(&a_Quotient, &a_Remainder, &a_Dividend, a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
/**
 * Emits one unsigned MUL worker named iemAImpl_mul_u<a_cBitsWidth><a_Suffix>.
 *
 * The worker multiplies the implicit first factor (a_fnLoadF1) by uFactor,
 * stores the double-width product via a_fnStore and always returns 0.
 *
 * CF and OF are set when the upper half of the product is non-zero and
 * cleared otherwise.  With a_fIntelFlags, AF and ZF are additionally cleared
 * and SF/PF are calculated from the lower half; without it those four flags
 * are left untouched.
 */
# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Product; \
    a_fnMul(Product, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
    a_fnStore(Product); \
    \
    /* CF + OF: common to both flavours. */ \
    uint32_t fNewEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    if (Product.s.Hi != 0) \
        fNewEfl |= X86_EFL_CF | X86_EFL_OF; \
    \
    if (a_fIntelFlags) \
    { /* Intel: 6700K and 10980XE behavior */ \
        fNewEfl &= ~(X86_EFL_SF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
        fNewEfl |= g_afParity[Product.s.Lo & 0xff]; \
        if (Product.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fNewEfl |= X86_EFL_SF; \
    } \
    /* else: AMD 3990X leaves SF, AF, ZF and PF alone. */ \
    \
    *pfEFlags = fNewEfl; \
    return 0; \
}
2260
/* Emits the three flavours of a MUL worker: the unsuffixed and the _intel one
   both pass a_fIntelFlags=1, the _amd one passes a_fIntelFlags=0. */
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
/* The 64-bit workers use the 128-bit multiplication helper and are always
   emitted; the 32, 16 and 8-bit ones are skipped on x86 hosts unless building
   without assembly.  The 8-bit worker operates on AX via puAX. */
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990X) leaves these
2306 * flags as is, whereas Intel Skylake (6700K) and Cascade Lake (10980XE) always
2307 * clear AF and ZF and calculate SF and PF as per the lower half of the result.
2308 */
/**
 * Emits one signed IMUL worker named iemAImpl_imul_u<a_cBitsWidth><a_Suffix>.
 *
 * The multiplication is done on the magnitudes of the two factors using the
 * unsigned multiplier (a_fnMul); the double-width product is negated
 * (a_fnNeg) when exactly one factor is negative and then stored (a_fnStore).
 * The worker always returns 0.
 *
 * CF and OF are set when the product does not fit into a signed value of
 * a_cBitsWidth bits.  With a_fIntelFlags, AF and ZF are cleared and SF/PF are
 * calculated from the lower half of the result; otherwise they are left as is.
 */
# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
    bool const fNegative1       = (uFactor1 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    bool const fNegative2       = (uFactor2 & RT_BIT_64(a_cBitsWidth - 1)) != 0; \
    bool const fNegativeProduct = fNegative1 != fNegative2; \
    \
    /* Multiply the magnitudes as unsigned values. */ \
    uint ## a_cBitsWidth ## _t const uMagnitude1 = fNegative1 ? UINT ## a_cBitsWidth ## _C(0) - uFactor1 : uFactor1; \
    uint ## a_cBitsWidth ## _t const uMagnitude2 = fNegative2 ? UINT ## a_cBitsWidth ## _C(0) - uFactor2 : uFactor2; \
    RTUINT ## a_cBitsWidth2x ## U Result; \
    a_fnMul(Result, uMagnitude1, uMagnitude2, a_cBitsWidth2x); \
    \
    /* Overflow check: a negative product may have a magnitude of up to 2^(width-1), \
       a positive one must stay below that. */ \
    uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
    if (   Result.s.Hi != 0 \
        || (fNegativeProduct \
            ? Result.s.Lo >  RT_BIT_64(a_cBitsWidth - 1) \
            : Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) ) \
        fEfl |= X86_EFL_CF | X86_EFL_OF; \
    \
    /* Apply the sign and store the result. */ \
    if (fNegativeProduct) \
        a_fnNeg(Result, a_cBitsWidth2x); \
    a_fnStore(Result); \
    \
    if (a_fIntelFlags) \
    { \
        fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
        fEfl |= g_afParity[Result.s.Lo & 0xff]; \
        if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
            fEfl |= X86_EFL_SF; \
    } \
    *pfEFlags = fEfl; \
    return 0; \
}
/* Emits the three flavours of an IMUL worker: the unsuffixed and the _intel
   one both pass a_fIntelFlags=1, the _amd one passes a_fIntelFlags=0. */
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
/* The 64-bit workers use the 128-bit helpers and are always emitted; the 32,
   16 and 8-bit ones are skipped on x86 hosts unless building without
   assembly.  The 8-bit worker operates on AX via puAX. */
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands is mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
/**
 * Emits one unsigned DIV worker named iemAImpl_div_u<a_cBitsWidth><a_Suffix>.
 *
 * The worker loads the double-width dividend (a_fnLoad), divides it by
 * uDivisor (a_fnDivRem) and stores quotient and remainder (a_fnStore).
 *
 * @returns 0 on success, -1 when a divide error (#DE) must be raised, i.e.
 *          when the divisor is zero or the quotient would not fit (upper
 *          dividend half not below the divisor).  Nothing is stored in the
 *          latter case.
 */
# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
                        a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    \
    /* #DE */ \
    if (   uDivisor == 0 \
        || Dividend.s.Hi >= uDivisor) \
        return -1; \
    \
    RTUINT ## a_cBitsWidth2x ## U Quotient, Remainder; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
    a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
    \
    /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/* Emits the three flavours of a DIV worker: the unsuffixed and the _intel one
   both pass a_fIntelFlags=1, the _amd one passes a_fIntelFlags=0. */
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
/* The 64-bit workers use the 128-bit division helper and are always emitted;
   the 32, 16 and 8-bit ones are skipped on x86 hosts unless building without
   assembly.  The 8-bit worker operates on AX via puAX. */
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
/**
 * Emits one signed IDIV worker named iemAImpl_idiv_u<a_cBitsWidth><a_Suffix>.
 *
 * The division is carried out on the magnitudes of dividend and divisor using
 * the unsigned divider (a_fnDivRem).  The quotient is negated when the signs
 * of dividend and divisor differ; the remainder takes the sign of the
 * dividend.
 *
 * @returns 0 on success, -1 when a divide error (#DE) must be raised, i.e.
 *          when the divisor is zero or the quotient does not fit into a
 *          signed value of a_cBitsWidth bits.  Nothing is stored in the
 *          latter case.
 */
# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
                         a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
{ \
    /* Note! Skylake leaves all flags alone. */ \
    \
    /** @todo overflow checks */ \
    if (uDivisor == 0) \
        return -1; /* #DE */ \
    \
    /* \
     * Convert to unsigned division. \
     */ \
    RTUINT ## a_cBitsWidth2x ## U Dividend; \
    a_fnLoad(Dividend); \
    bool const fNegativeDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
    bool const fNegativeDivisor  = RT_BOOL(uDivisor & RT_BIT_64(a_cBitsWidth - 1)); \
    if (fNegativeDividend) \
        a_fnNeg(Dividend, a_cBitsWidth2x); \
    uint ## a_cBitsWidth ## _t const uDivisorPositive = fNegativeDivisor \
                                                      ? UINT ## a_cBitsWidth ## _C(0) - uDivisor : uDivisor; \
    \
    RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
    a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
    \
    /* \
     * Check for overflow: a negative quotient may have a magnitude of up to \
     * 2^(width-1), a positive one at most the signed maximum. \
     */ \
    bool const     fNegativeQuotient = fNegativeDividend != fNegativeDivisor; \
    uint64_t const uMaxMagnitude     = fNegativeQuotient \
                                     ? RT_BIT_64(a_cBitsWidth - 1) \
                                     : (uint64_t)(uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX; \
    if (   Quotient.s.Hi != 0 \
        || Quotient.s.Lo > uMaxMagnitude) \
        return -1; /* #DE */ \
    \
    /* \
     * Store the result with the signs applied. \
     */ \
    a_fnStore(fNegativeQuotient ? UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo  : Quotient.s.Lo, \
              fNegativeDividend ? UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo : Remainder.s.Lo); \
    if (!a_fIntelFlags) \
        *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
    return 0; \
}
/* Emits the three flavours of an IDIV worker: the unsuffixed and the _intel
   one both pass a_fIntelFlags=1, the _amd one passes a_fIntelFlags=0. */
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
/* The 64-bit workers use the 128-bit helpers and are always emitted; the 32,
   16 and 8-bit ones are skipped on x86 hosts unless building without
   assembly.  The 8-bit worker operates on AX via puAX. */
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
2574/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
2575 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC instruction.
2576 *
2577 * CF is NOT modified for hysterical raisins (allegedly for carrying and
2578 * borrowing in arithmetic loops on intel 8008).
2579 *
2580 * @remarks Expands to a statement; nothing is returned, the updated flags are
 *          written back through a_pfEFlags.
2581 * @param a_pfEFlags Pointer to the 32-bit EFLAGS value to update.
2582 * @param a_uResult Unsigned result value.
2583 * @param a_uDst The original destination value (for AF calc).
2584 * @param a_cBitsWidth The width of the result (8, 16, 32, 64).
2585 * @param a_OfMethod 0 for INC-style, 1 for DEC-style.
2586 */
2587#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
2588 do { \
2589 uint32_t fEflTmp = *(a_pfEFlags); \
 /* Clear all status bits except CF (the ~ applies before the |). */ \
2590 fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
2591 fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
 /* AF: picked out of the XOR of result and original value. */ \
2592 fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
2593 fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
2594 fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
 /* OF, INC-style: sign bit clear in the original and set in the result; \
    DEC-style: sign bit set in the original and clear in the result. */ \
2595 fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
2596 : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
2597 *(a_pfEFlags) = fEflTmp; \
2598 } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
 *
 * This is a statement macro (do/while), it does not yield a value.  The
 * arguments are evaluated more than once, so pass side-effect free expressions.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS & ~X86_EFL_CF; \
        /* CF is set for any non-zero input. */ \
        fEflTmp |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF: the top bit is set in both the input and the result. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
/**
 * Emit a function for doing a locked unary operand operation.
 *
 * The emitted iemAImpl_<mnemonic>_u<width>_locked function applies the
 * non-locked worker to local copies of the operand and EFLAGS and publishes the
 * result with a compare-exchange, retrying until the exchange succeeds.  The
 * caller's EFLAGS are only written once the exchange has succeeded.
 *
 * @param   a_Mnemonic      The instruction mnemonic (inc, dec, not, neg).
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 */
# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
    IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
                                                                                      uint32_t *pfEFlags)) \
    { \
        uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uTmp; \
        uint32_t fEflTmp; \
        do \
        { \
            /* Start each attempt from the most recently observed value and the caller's flags. */ \
            uTmp = uOld; \
            fEflTmp = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
            /* NOTE(review): presumably the Ex variant refreshes uOld with the current \
               memory value when the exchange fails - confirm against iprt/asm.h. */ \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
        *pfEFlags = fEflTmp; \
    }

/* The 64-bit variants are emitted whenever this section is compiled; the
   narrower ones only when not building for x86 or when assembly is disabled. */
EMIT_LOCKED_UNARY_OP(inc, 64)
EMIT_LOCKED_UNARY_OP(dec, 64)
EMIT_LOCKED_UNARY_OP(not, 64)
EMIT_LOCKED_UNARY_OP(neg, 64)
# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_LOCKED_UNARY_OP(inc, 32)
EMIT_LOCKED_UNARY_OP(dec, 32)
EMIT_LOCKED_UNARY_OP(not, 32)
EMIT_LOCKED_UNARY_OP(neg, 32)

EMIT_LOCKED_UNARY_OP(inc, 16)
EMIT_LOCKED_UNARY_OP(dec, 16)
EMIT_LOCKED_UNARY_OP(not, 16)
EMIT_LOCKED_UNARY_OP(neg, 16)

EMIT_LOCKED_UNARY_OP(inc, 8)
EMIT_LOCKED_UNARY_OP(dec, 8)
EMIT_LOCKED_UNARY_OP(not, 8)
EMIT_LOCKED_UNARY_OP(neg, 8)
# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
/**
 * Emits the C implementation of ROL for one operand width and EFLAGS flavour.
 *
 * The emitted function rotates *puDst left and updates only CF and OF in
 * *pfEFlags.  Nothing is modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel OF calculation, zero for the AMD one.
 * @param   a_fnHlp         The rotate helper, invoked as a_fnHlp(uValue, cShift).
 */
#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* The count is masked by 63 for 64-bit operands and by 31 for everything else. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* 8 and 16-bit: reduce the count to the operand width only after the zero \
           check, so that a count equal to a multiple of the width still updates the flags. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst    = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF = bit 0 of the result, i.e. the last bit rotated around. */ \
        uint32_t const fCarry = (uResult & X86_EFL_CF); \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
        else                /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
#endif
EMIT_ROL(64, uint64_t, _intel,     1, ASMRotateLeftU64)
EMIT_ROL(64, uint64_t, _amd,       0, ASMRotateLeftU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
#endif
EMIT_ROL(32, uint32_t, _intel,     1, ASMRotateLeftU32)
EMIT_ROL(32, uint32_t, _amd,       0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
/**
 * Emits the C implementation of ROR for one operand width and EFLAGS flavour.
 *
 * The emitted function rotates *puDst right and updates only CF and OF in
 * *pfEFlags.  Nothing is modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel OF calculation, zero for the AMD one.
 * @param   a_fnHlp         The rotate helper, invoked as a_fnHlp(uValue, cShift).
 */
#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* The count is masked by 63 for 64-bit operands and by 31 for everything else. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* 8 and 16-bit: reduce the count to the operand width only after the zero \
           check, so that a count equal to a multiple of the width still updates the flags. */ \
        if (a_cBitsWidth < 32) \
            cShift &= a_cBitsWidth - 1; \
        a_uType const uDst    = *puDst; \
        a_uType const uResult = a_fnHlp(uDst, cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS:  */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags; \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* CF = the most significant bit of the result, i.e. the last bit rotated around. */ \
        uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
            fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
        else                /* Intel 10980XE: According to the first sub-shift: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
#endif
EMIT_ROR(64, uint64_t, _intel,     1, ASMRotateRightU64)
EMIT_ROR(64, uint64_t, _amd,       0, ASMRotateRightU64)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
#endif
EMIT_ROR(32, uint32_t, _intel,     1, ASMRotateRightU32)
EMIT_ROR(32, uint32_t, _amd,       0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
/**
 * Emits the C implementation of RCR for one operand width and EFLAGS flavour.
 *
 * The emitted function rotates *puDst right through CF and updates only CF and
 * OF in *pfEFlags.  For 8 and 16-bit operands the count is additionally reduced
 * modulo width + 1: the Intel flavour does this before the zero check, the AMD
 * flavour after it, so the AMD flavour can get here with an effective count of
 * zero and still recalculate the flags.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel behaviour, zero for the AMD one.
 */
#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (a_cBitsWidth < 32 && a_fIntelFlags) \
        cShift %= a_cBitsWidth + 1; \
    if (cShift) \
    { \
        if (a_cBitsWidth < 32 && !a_fIntelFlags) \
            cShift %= a_cBitsWidth + 1; \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        if (cShift > 1) \
            uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
        \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl     = *pfEFlags; \
        uint32_t fInCarry = fEfl & X86_EFL_CF; \
        /* With an effective count of zero (AMD, 8/16-bit only) the carry is shifted \
           to bit a_cBitsWidth and is dropped when stored in a_uType. */ \
        uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* Calc EFLAGS.  The OF bit is undefined if cShift > 1, we implement \
           it the same way as for 1 bit shifts. */ \
        fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
        /* With an effective count of zero (AMD, 8/16-bit only) CF is left unchanged. */ \
        uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
                                 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
        fEfl |= fOutCarry; \
        if (!a_fIntelFlags) /* AMD 3990X: XOR two most significant bits of the result: */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1));  \
        else                /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
            fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_RCR(64, uint64_t, _intel,     1)
EMIT_RCR(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_RCR(32, uint32_t, _intel,     1)
EMIT_RCR(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_RCR(16, uint16_t, _intel,     1)
EMIT_RCR(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_RCR(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_RCR(8,  uint8_t,  _intel,     1)
EMIT_RCR(8,  uint8_t,  _amd,       0)
3092
3093
3094/*
3095 * SHL
3096 */
/**
 * Emits the C implementation of SHL for one operand width and EFLAGS flavour.
 *
 * The emitted function shifts *puDst left and recalculates all the status
 * flags in *pfEFlags.  Nothing is modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel flag behaviour, zero for the AMD one.
 */
#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* The count is masked by 63 for 64-bit operands and by 31 for everything else. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst  = *puDst; \
        a_uType       uResult = uDst << cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* NOTE(review): for 8 and 16-bit operands cShift can exceed a_cBitsWidth \
           here (it is only masked by 31), making this shift count negative. */ \
        uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
        fEfl |= fCarry; \
        if (!a_fIntelFlags) \
            fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
        else \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHL(64, uint64_t, _intel,     1)
EMIT_SHL(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHL(32, uint32_t, _intel,     1)
EMIT_SHL(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHL(16, uint16_t, _intel,     1)
EMIT_SHL(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHL(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_SHL(8,  uint8_t,  _intel,     1)
EMIT_SHL(8,  uint8_t,  _amd,       0)
3148
3149
3150/*
3151 * SHR
3152 */
/**
 * Emits the C implementation of SHR for one operand width and EFLAGS flavour.
 *
 * The emitted function shifts *puDst right (zero filling) and recalculates
 * all the status flags in *pfEFlags.  Nothing is modified when the masked
 * count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel flag behaviour, zero for the AMD one.
 */
#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* The count is masked by 63 for 64-bit operands and by 31 for everything else. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        a_uType const uDst  = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF = the last bit shifted out. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        /* OF = the most significant bit of the original operand. */ \
        if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
            fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHR(64, uint64_t, _intel,     1)
EMIT_SHR(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHR(32, uint32_t, _intel,     1)
EMIT_SHR(32, uint32_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
#endif
EMIT_SHR(16, uint16_t, _intel,     1)
EMIT_SHR(16, uint16_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHR(8,  uint8_t,  RT_NOTHING, 1)
#endif
EMIT_SHR(8,  uint8_t,  _intel,     1)
EMIT_SHR(8,  uint8_t,  _amd,       0)
3201
3202
3203/*
3204 * SAR
3205 */
/**
 * Emits the C implementation of SAR for one operand width and EFLAGS flavour.
 *
 * The emitted function shifts *puDst right as a signed value and recalculates
 * the status flags in *pfEFlags (OF always ends up clear).  Nothing is
 * modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (8, 16, 32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_iType         The signed integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel flag behaviour, zero for the AMD one.
 */
#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    /* The count is masked by 63 for 64-bit operands and by 31 for everything else. */ \
    cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
    if (cShift) \
    { \
        /* The shift is done on the signed type so the sign bit gets replicated. */ \
        a_iType const iDst  = (a_iType)*puDst; \
        a_uType       uResult = iDst >> cShift; \
        *puDst = uResult; \
        \
        /* Calc EFLAGS. \
           Note! The OF flag is always zero because the result never differs from the input. */ \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        /* CF = the last bit shifted out. */ \
        fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        if (!a_fIntelFlags) \
            fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the opposite */ \
        *pfEFlags = fEfl; \
    } \
}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3268 * intel has changed behaviour here several times. We implement what current
3269 * skylake based does for now, we can extend this later as needed.
3270 */
/**
 * Emits the C implementation of SHLD for 32-bit and 64-bit operands.
 *
 * The emitted function shifts *puDst left by the masked count, fills the
 * vacated low bits from the top of uSrc and recalculates the status flags in
 * *pfEFlags.  Nothing is modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel flag behaviour, zero for the AMD one.
 */
#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
                                                                          uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst << cShift; \
        uResult |= uSrc >> (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
        else \
        {   /* AMD 3990X: Set according to last shift. AF always set. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(64, uint64_t, _intel,     1)
EMIT_SHLD(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHLD(32, uint32_t, _intel,     1)
EMIT_SHLD(32, uint32_t, _amd,       0)
3313
/**
 * Emits the C implementation of the 16-bit SHLD.
 *
 * The count is masked by 31 rather than 15.  To deal with counts of 16 and
 * above the shift is done on a 48-bit combined operand: uDst:uSrc:uDst for the
 * Intel flavour and uDst:uSrc:uSrc for the AMD one.  Nothing is modified when
 * the masked count is zero.
 *
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel behaviour, zero for the AMD one.
 */
#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst    = *puDst; \
        /* Combined operand: uDst in bits 47:32, uSrc in bits 31:16 and the vendor \
           specific filler in bits 15:0. */ \
        uint64_t const uTmp    = a_fIntelFlags \
                               ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
                               : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
        /* The result is whatever ends up in bits 47:32 after the shift. */ \
        uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
        *puDst = uResult; \
        \
        /* CALC EFLAGS: */ \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
            /* Intel 6700K & 10980XE: OF is set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
        } \
        else \
        { \
            /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
            if (cShift < 16) \
            { \
                fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
            } \
            else \
            { \
                /* CF is only set for a count of exactly 16, it stays clear for 17 thru 31. */ \
                if (cShift == 16) \
                    fEfl |= uDst & X86_EFL_CF; \
                fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
            } \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= g_afParity[uResult & 0xff]; \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variant is only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHLD_16(RT_NOTHING, 1)
#endif
EMIT_SHLD_16(_intel,     1)
EMIT_SHLD_16(_amd,       0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
3380 * intel has changed behaviour here several times. We implement what current
3381 * skylake based does for now, we can extend this later as needed.
3382 */
/**
 * Emits the C implementation of SHRD for 32-bit and 64-bit operands.
 *
 * The emitted function shifts *puDst right by the masked count, fills the
 * vacated high bits from the bottom of uSrc and recalculates the status flags
 * in *pfEFlags.  Nothing is modified when the masked count is zero.
 *
 * @param   a_cBitsWidth    The operand width in bits (32, 64).
 * @param   a_uType         The unsigned integer type matching a_cBitsWidth.
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel flag behaviour, zero for the AMD one.
 */
#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= a_cBitsWidth - 1; \
    if (cShift) \
    { \
        a_uType const uDst    = *puDst; \
        a_uType       uResult = uDst >> cShift; \
        uResult |= uSrc << (a_cBitsWidth - cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        /* CF = the last bit shifted out of uDst. */ \
        fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
        if (a_fIntelFlags) \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
        else \
        {   /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variants are only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(64, uint64_t, _intel,     1)
EMIT_SHRD(64, uint64_t, _amd,       0)

#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
#endif
EMIT_SHRD(32, uint32_t, _intel,     1)
EMIT_SHRD(32, uint32_t, _amd,       0)
3426
/**
 * Emits the C implementation of the 16-bit SHRD.
 *
 * The count is masked by 31 rather than 15.  To deal with counts of 16 and
 * above the shift is done on a 48-bit combined operand: uDst:uSrc:uDst for the
 * Intel flavour and uSrc:uSrc:uDst for the AMD one (most significant word
 * first).  Nothing is modified when the masked count is zero.
 *
 * @param   a_Suffix        The function name suffix (RT_NOTHING, _intel, _amd).
 * @param   a_fIntelFlags   Non-zero for the Intel behaviour, zero for the AMD one.
 */
#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
{ \
    cShift &= 31; \
    if (cShift) \
    { \
        uint16_t const uDst    = *puDst; \
        /* Combined operand: uDst in bits 15:0, uSrc in bits 31:16 and the vendor \
           specific filler in bits 47:32. */ \
        uint64_t const uTmp    = a_fIntelFlags \
                               ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
                               : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
        /* The result is whatever ends up in bits 15:0 after the shift. */ \
        uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
        *puDst = uResult; \
        \
        uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
        AssertCompile(X86_EFL_CF_BIT == 0); \
        if (a_fIntelFlags) \
        { \
            /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
            fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
            /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
            fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
        } \
        else \
        { \
            /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
            fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
            /* AMD 3990X: Set according to last shift. AF always set. */ \
            if (cShift > 1) /* Set according to last shift. */ \
                fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
            else \
                fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
            fEfl |= X86_EFL_AF; \
        } \
        fEfl |= X86_EFL_CALC_SF(uResult, 16); \
        fEfl |= X86_EFL_CALC_ZF(uResult); \
        fEfl |= g_afParity[uResult & 0xff]; \
        *pfEFlags = fEfl; \
    } \
}

/* The suffix-less variant is only emitted when there is no assembly
   implementation in the build; the _intel and _amd ones always are. */
#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
EMIT_SHRD_16(RT_NOTHING, 1)
#endif
EMIT_SHRD_16(_intel,     1)
EMIT_SHRD_16(_amd,       0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
/** LFENCE worker for builds without assembly: delegates to ASMReadFence(). */
3642IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
3643{
3644 ASMReadFence();
3645}
3646
3647
/** SFENCE worker for builds without assembly: delegates to ASMWriteFence(). */
3648IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
3649{
3650 ASMWriteFence();
3651}
3652
3653
/** MFENCE worker for builds without assembly: delegates to ASMMemoryFence(). */
3654IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
3655{
3656 ASMMemoryFence();
3657}
3658
3659
3660# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; here it is the same ASMMemoryFence() call
 * as iemAImpl_mfence.  Not compiled when RT_ARCH_ARM64 is defined.
 */
3661IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
3662{
3663 ASMMemoryFence();
3664}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
3692IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
3693{
3694 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3695 if (RTFLOAT32U_IS_NORMAL(pr32Val))
3696 {
3697 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3698 pFpuRes->r80Result.sj64.fInteger = 1;
3699 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3700 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3701 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3702 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3703 }
3704 else if (RTFLOAT32U_IS_ZERO(pr32Val))
3705 {
3706 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3707 pFpuRes->r80Result.s.uExponent = 0;
3708 pFpuRes->r80Result.s.uMantissa = 0;
3709 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3710 }
3711 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
3712 {
3713 /* Subnormal values gets normalized. */
3714 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3715 pFpuRes->r80Result.sj64.fInteger = 1;
3716 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
3717 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3718 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
3719 pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3720 pFpuRes->FSW |= X86_FSW_DE;
3721 if (!(pFpuState->FCW & X86_FCW_DM))
3722 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3723 }
3724 else if (RTFLOAT32U_IS_INF(pr32Val))
3725 {
3726 pFpuRes->r80Result.s.fSign = pr32Val->s.fSign;
3727 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3728 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3729 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3730 }
3731 else
3732 {
3733 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3734 Assert(RTFLOAT32U_IS_NAN(pr32Val));
3735 pFpuRes->r80Result.sj64.fSign = pr32Val->s.fSign;
3736 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3737 pFpuRes->r80Result.sj64.fInteger = 1;
3738 pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
3739 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
3740 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
3741 {
3742 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3743 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3744 pFpuRes->FSW |= X86_FSW_IE;
3745
3746 if (!(pFpuState->FCW & X86_FCW_IM))
3747 {
3748 /* The value is not pushed. */
3749 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3750 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3751 pFpuRes->r80Result.au64[0] = 0;
3752 pFpuRes->r80Result.au16[4] = 0;
3753 }
3754 }
3755 else
3756 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3757 }
3758}
3759
3760
3761IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
3762{
3763 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3764 if (RTFLOAT64U_IS_NORMAL(pr64Val))
3765 {
3766 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3767 pFpuRes->r80Result.sj64.fInteger = 1;
3768 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3769 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
3770 Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
3771 }
3772 else if (RTFLOAT64U_IS_ZERO(pr64Val))
3773 {
3774 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3775 pFpuRes->r80Result.s.uExponent = 0;
3776 pFpuRes->r80Result.s.uMantissa = 0;
3777 Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
3778 }
3779 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
3780 {
3781 /* Subnormal values gets normalized. */
3782 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3783 pFpuRes->r80Result.sj64.fInteger = 1;
3784 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
3785 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
3786 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
3787 pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
3788 pFpuRes->FSW |= X86_FSW_DE;
3789 if (!(pFpuState->FCW & X86_FCW_DM))
3790 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
3791 }
3792 else if (RTFLOAT64U_IS_INF(pr64Val))
3793 {
3794 pFpuRes->r80Result.s.fSign = pr64Val->s.fSign;
3795 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
3796 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
3797 Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
3798 }
3799 else
3800 {
3801 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
3802 Assert(RTFLOAT64U_IS_NAN(pr64Val));
3803 pFpuRes->r80Result.sj64.fSign = pr64Val->s.fSign;
3804 pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
3805 pFpuRes->r80Result.sj64.fInteger = 1;
3806 pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
3807 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
3808 {
3809 pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
3810 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3811 pFpuRes->FSW |= X86_FSW_IE;
3812
3813 if (!(pFpuState->FCW & X86_FCW_IM))
3814 {
3815 /* The value is not pushed. */
3816 pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
3817 pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
3818 pFpuRes->r80Result.au64[0] = 0;
3819 pFpuRes->r80Result.au16[4] = 0;
3820 }
3821 }
3822 else
3823 Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
3824 }
3825}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
3952IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
3953{
3954 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3955 if ( pd80Val->s.abPairs[0] == 0
3956 && pd80Val->s.abPairs[1] == 0
3957 && pd80Val->s.abPairs[2] == 0
3958 && pd80Val->s.abPairs[3] == 0
3959 && pd80Val->s.abPairs[4] == 0
3960 && pd80Val->s.abPairs[5] == 0
3961 && pd80Val->s.abPairs[6] == 0
3962 && pd80Val->s.abPairs[7] == 0
3963 && pd80Val->s.abPairs[8] == 0)
3964 {
3965 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3966 pFpuRes->r80Result.s.uExponent = 0;
3967 pFpuRes->r80Result.s.uMantissa = 0;
3968 }
3969 else
3970 {
3971 pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;
3972
3973 size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
3974 while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
3975 cPairs--;
3976
3977 uint64_t uVal = 0;
3978 uint64_t uFactor = 1;
3979 for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
3980 uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
3981 + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;
3982
3983 unsigned const cBits = ASMBitLastSetU64(uVal);
3984 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
3985 pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
3986 }
3987}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
3994/**
3995 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
3996 *
3997 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 * On the early-return paths (unmasked underflow or overflow) pr32Dst is left
 * unwritten.
3998 *
3999 * @returns Updated FPU status word value.
4000 * @param fSignIn Incoming sign indicator.
4001 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4002 * @param iExponentIn Unbiased exponent.
4003 * @param fFcw The FPU control word.
4004 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4005 * @param pr32Dst Where to return the output value, if one should be
4006 * returned.
4007 *
4008 * @note Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
4009 * @note Exact same logic as iemAImpl_StoreNormalR80AsR64.
4010 */
4011static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4012 uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
4013{
 /* Mask of the mantissa bits that do not fit into the 32-bit fraction. */
4014 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1; /* 0xffffffffff if the fraction widths are 63 and 23 bits (0x7ff is the R64 value) */
 /* Rounding increment: half an output ULP for nearest, just below one ULP when rounding away from zero, else none. */
4015 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4016 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1) /* 0x8000000000 under the same assumption (0x400 is the R64 value) */
4017 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4018 ? fRoundingOffMask
4019 : 0;
4020 uint64_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4021
4022 /*
4023 * Deal with potential overflows/underflows first, optimizing for none.
4024 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
 * The single unsigned compare accepts biased exponents 1 thru MAX-2; zero
 * and negative values wrap around to large unsigned numbers and fail it.
4025 */
4026 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
4027 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
4028 { /* likely? */ }
4029 /*
4030 * Underflow if the exponent zero or negative. This is attempted mapped
4031 * to a subnormal number when possible, with some additional trickery ofc.
4032 */
4033 else if (iExponentOut <= 0)
4034 {
4035 bool const fIsTiny = iExponentOut < 0
4036 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4037 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4038 /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4039 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4040
4041 if (iExponentOut <= 0)
4042 {
 /* Denormalize: shift right, folding any bits shifted out into a sticky bit 0. */
4043 uMantissaIn = iExponentOut <= -63
4044 ? uMantissaIn != 0
4045 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4046 fRoundedOff = uMantissaIn & fRoundingOffMask;
4047 if (fRoundedOff && fIsTiny)
4048 fFsw |= X86_FSW_UE;
4049 iExponentOut = 0;
4050 }
4051 }
4052 /*
4053 * Overflow if at or above max exponent value or if we will reach max
4054 * when rounding. Will return +/-infinity or +/-max value depending on
4055 * whether we're rounding or not.
4056 */
4057 else if ( iExponentOut >= RTFLOAT32U_EXP_MAX
4058 || ( iExponentOut == RTFLOAT32U_EXP_MAX - 1
4059 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4060 {
4061 fFsw |= X86_FSW_OE;
4062 if (!(fFcw & X86_FCW_OM))
4063 return fFsw | X86_FSW_ES | X86_FSW_B;
4064 fFsw |= X86_FSW_PE;
4065 if (uRoundingAdd)
4066 fFsw |= X86_FSW_C1;
4067 if (!(fFcw & X86_FCW_PM))
4068 fFsw |= X86_FSW_ES | X86_FSW_B;
4069
4070 pr32Dst->s.fSign = fSignIn;
4071 if (uRoundingAdd)
4072 { /* Infinity (max exponent, zero fraction) */
4073 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4074 pr32Dst->s.uFraction = 0;
4075 }
4076 else
4077 { /* Max */
4078 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
4079 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
4080 }
4081 return fFsw;
4082 }
4083
4084 /*
4085 * Normal or subnormal number.
4086 */
4087 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4088 uint64_t uMantissaOut = uMantissaIn;
4089 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4090 || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
4091 || fRoundedOff != uRoundingAdd)
4092 {
4093 uMantissaOut = uMantissaIn + uRoundingAdd;
4094 if (uMantissaOut >= uMantissaIn)
4095 { /* likely */ }
4096 else
4097 {
 /* The addition wrapped around, i.e. the mantissa carried out of bit 63. */
4098 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4099 iExponentOut++;
4100 Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
4101 fFsw |= X86_FSW_C1;
4102 }
4103 }
4104 else
4105 uMantissaOut = uMantissaIn;
4106
4107 /* Truncate the mantissa and set the return value. */
4108 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;
4109
4110 pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
4111 pr32Dst->s.uExponent = iExponentOut;
4112 pr32Dst->s.fSign = fSignIn;
4113
4114 /* Set status flags related to rounding. */
4115 if (fRoundedOff)
4116 {
4117 fFsw |= X86_FSW_PE;
4118 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
4119 fFsw |= X86_FSW_C1;
4120 if (!(fFcw & X86_FCW_PM))
4121 fFsw |= X86_FSW_ES | X86_FSW_B;
4122 }
4123
4124 return fFsw;
4125}
4126
4127
4128/**
 * Stores an 80-bit value as a 32-bit one.
 *
 * The source is classified and each class handled in turn: normal values go
 * through iemAImpl_StoreNormalR80AsR32, zero and infinity are copied with
 * their sign, and the remaining classes produce a quiet NaN or a tiny result
 * when the corresponding exception is masked in FCW.  When it is not masked,
 * only the status word is updated and pr32Dst is left unwritten.
 *
 * The status word returned via pu16FSW is TOP=7 plus the caller's C0, C2 and
 * C3, with the exception and C1 bits added by the individual paths.
 *
4129 * @note Exact same logic as iemAImpl_fst_r80_to_r64.
4130 */
4131IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4132 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
4133{
4134 uint16_t const fFcw = pFpuState->FCW;
4135 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4136 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4137 fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
4138 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
4139 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4140 {
4141 pr32Dst->s.fSign = pr80Src->s.fSign;
4142 pr32Dst->s.uExponent = 0;
4143 pr32Dst->s.uFraction = 0;
4144 Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
4145 }
4146 else if (RTFLOAT80U_IS_INF(pr80Src))
4147 {
4148 pr32Dst->s.fSign = pr80Src->s.fSign;
4149 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4150 pr32Dst->s.uFraction = 0;
4151 Assert(RTFLOAT32U_IS_INF(pr32Dst));
4152 }
4153 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4154 {
4155 /* Mapped to +/-QNaN */
4156 pr32Dst->s.fSign = pr80Src->s.fSign;
4157 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4158 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4159 }
4160 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4161 {
4162 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4163 if (fFcw & X86_FCW_IM)
4164 {
 /* Masked: a negative quiet NaN with only the top fraction bit set is stored. */
4165 pr32Dst->s.fSign = 1;
4166 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
4167 pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4168 fFsw |= X86_FSW_IE;
4169 }
4170 else
4171 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4172 }
4173 else if (RTFLOAT80U_IS_NAN(pr80Src))
4174 {
4175 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4176 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4177 {
4178 pr32Dst->s.fSign = pr80Src->s.fSign;
4179 pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
 /* Keep the top fraction bits of the payload and force the quiet bit. */
4180 pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
4181 pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
4182 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4183 fFsw |= X86_FSW_IE;
4184 }
4185 else
4186 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4187 }
4188 else
4189 {
4190 /* Denormal values causes both an underflow and precision exception. */
4191 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4192 if (fFcw & X86_FCW_UM)
4193 {
4194 pr32Dst->s.fSign = pr80Src->s.fSign;
4195 pr32Dst->s.uExponent = 0;
 /* Rounding away from zero yields the smallest subnormal, any other mode yields zero. */
4196 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4197 {
4198 pr32Dst->s.uFraction = 1;
4199 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4200 if (!(fFcw & X86_FCW_PM))
4201 fFsw |= X86_FSW_ES | X86_FSW_B;
4202 }
4203 else
4204 {
4205 pr32Dst->s.uFraction = 0;
4206 fFsw |= X86_FSW_UE | X86_FSW_PE;
4207 if (!(fFcw & X86_FCW_PM))
4208 fFsw |= X86_FSW_ES | X86_FSW_B;
4209 }
4210 }
4211 else
4212 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4213 }
4214 *pu16FSW = fFsw;
4215}
4216
4217
4218/**
4219 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
4220 *
4221 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 * On the early-return paths (unmasked underflow or overflow) pr64Dst is left
 * unwritten.
4222 *
4223 * @returns Updated FPU status word value.
4224 * @param fSignIn Incoming sign indicator.
4225 * @param uMantissaIn Incoming mantissa (dot between bit 63 and 62).
4226 * @param iExponentIn Unbiased exponent.
4227 * @param fFcw The FPU control word.
4228 * @param fFsw Prepped FPU status word, i.e. exceptions and C1 clear.
4229 * @param pr64Dst Where to return the output value, if one should be
4230 * returned.
4231 *
4232 * @note Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
4233 * @note Exact same logic as iemAImpl_StoreNormalR80AsR32.
4234 */
4235static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
4236 uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
4237{
 /* Mask of the mantissa bits that do not fit into the 64-bit format's fraction. */
4238 uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
 /* Rounding increment: half an output ULP for nearest, just below one ULP when rounding away from zero, else none. */
4239 uint32_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4240 ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
4241 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4242 ? fRoundingOffMask
4243 : 0;
4244 uint32_t fRoundedOff = uMantissaIn & fRoundingOffMask;
4245
4246 /*
4247 * Deal with potential overflows/underflows first, optimizing for none.
4248 * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
 * The single unsigned compare accepts biased exponents 1 thru MAX-2; zero
 * and negative values wrap around to large unsigned numbers and fail it.
4249 */
4250 int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
4251 if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
4252 { /* likely? */ }
4253 /*
4254 * Underflow if the exponent zero or negative. This is attempted mapped
4255 * to a subnormal number when possible, with some additional trickery ofc.
4256 */
4257 else if (iExponentOut <= 0)
4258 {
4259 bool const fIsTiny = iExponentOut < 0
4260 || UINT64_MAX - uMantissaIn > uRoundingAdd;
4261 if (!(fFcw & X86_FCW_UM) && fIsTiny)
4262 /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
4263 return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4264
4265 if (iExponentOut <= 0)
4266 {
 /* Denormalize: shift right, folding any bits shifted out into a sticky bit 0. */
4267 uMantissaIn = iExponentOut <= -63
4268 ? uMantissaIn != 0
4269 : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
4270 fRoundedOff = uMantissaIn & fRoundingOffMask;
4271 if (fRoundedOff && fIsTiny)
4272 fFsw |= X86_FSW_UE;
4273 iExponentOut = 0;
4274 }
4275 }
4276 /*
4277 * Overflow if at or above max exponent value or if we will reach max
4278 * when rounding. Will return +/-infinity or +/-max value depending on
4279 * whether we're rounding or not.
4280 */
4281 else if ( iExponentOut >= RTFLOAT64U_EXP_MAX
4282 || ( iExponentOut == RTFLOAT64U_EXP_MAX - 1
4283 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
4284 {
4285 fFsw |= X86_FSW_OE;
4286 if (!(fFcw & X86_FCW_OM))
4287 return fFsw | X86_FSW_ES | X86_FSW_B;
4288 fFsw |= X86_FSW_PE;
4289 if (uRoundingAdd)
4290 fFsw |= X86_FSW_C1;
4291 if (!(fFcw & X86_FCW_PM))
4292 fFsw |= X86_FSW_ES | X86_FSW_B;
4293
4294 pr64Dst->s64.fSign = fSignIn;
4295 if (uRoundingAdd)
4296 { /* Infinity (max exponent, zero fraction) */
4297 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4298 pr64Dst->s64.uFraction = 0;
4299 }
4300 else
4301 { /* Max */
4302 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
4303 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
4304 }
4305 return fFsw;
4306 }
4307
4308 /*
4309 * Normal or subnormal number.
4310 */
4311 /* Do rounding - just truncate in near mode when midway on an even outcome. */
4312 uint64_t uMantissaOut = uMantissaIn;
4313 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
4314 || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
4315 || fRoundedOff != uRoundingAdd)
4316 {
4317 uMantissaOut = uMantissaIn + uRoundingAdd;
4318 if (uMantissaOut >= uMantissaIn)
4319 { /* likely */ }
4320 else
4321 {
 /* The addition wrapped around, i.e. the mantissa carried out of bit 63. */
4322 uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
4323 iExponentOut++;
4324 Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
4325 fFsw |= X86_FSW_C1;
4326 }
4327 }
4328 else
4329 uMantissaOut = uMantissaIn;
4330
4331 /* Truncate the mantissa and set the return value. */
4332 uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;
4333
4334 pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
4335 pr64Dst->s64.uExponent = iExponentOut;
4336 pr64Dst->s64.fSign = fSignIn;
4337
4338 /* Set status flags related to rounding. */
4339 if (fRoundedOff)
4340 {
4341 fFsw |= X86_FSW_PE;
4342 if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
4343 fFsw |= X86_FSW_C1;
4344 if (!(fFcw & X86_FCW_PM))
4345 fFsw |= X86_FSW_ES | X86_FSW_B;
4346 }
4347
4348 return fFsw;
4349}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
4456/*
4457 *
4458 * Mantissa:
4459 * 63 56 48 40 32 24 16 8 0
4460 * v v v v v v v v v
4461 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
4462 * \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \
4463 * Exp: 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
4464 *
4465 * int64_t has the same width, only bit 63 is the sign bit. So, the max we can map over
4466 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62. The number of bits we
4467 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
4468 * where we'll drop off all but bit 63.
4469 */
4470#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
4471IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4472 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4473{ \
4474 uint16_t const fFcw = pFpuState->FCW; \
4475 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4476 bool const fSignIn = pr80Val->s.fSign; \
4477 \
4478 /* \
4479 * Deal with normal numbers first. \
4480 */ \
4481 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4482 { \
4483 uint64_t uMantissa = pr80Val->s.uMantissa; \
4484 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4485 \
4486 if ((uint32_t)iExponent <= a_cBits - 2) \
4487 { \
4488 unsigned const cShiftOff = 63 - iExponent; \
4489 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4490 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
4491 ? RT_BIT_64(cShiftOff - 1) \
4492 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
4493 ? fRoundingOffMask \
4494 : 0; \
4495 uint64_t fRoundedOff = uMantissa & fRoundingOffMask; \
4496 \
4497 uMantissa >>= cShiftOff; \
4498 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
4499 uMantissa += uRounding; \
4500 if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
4501 { \
4502 if (fRoundedOff) \
4503 { \
4504 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
4505 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
4506 else if (uRounding) \
4507 fFsw |= X86_FSW_C1; \
4508 fFsw |= X86_FSW_PE; \
4509 if (!(fFcw & X86_FCW_PM)) \
4510 fFsw |= X86_FSW_ES | X86_FSW_B; \
4511 } \
4512 \
4513 if (!fSignIn) \
4514 *piDst = (a_iType)uMantissa; \
4515 else \
4516 *piDst = -(a_iType)uMantissa; \
4517 } \
4518 else \
4519 { \
4520 /* overflowed after rounding. */ \
4521 AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
4522 ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
4523 pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
4524 \
4525 /* Special case for the integer minimum value. */ \
4526 if (fSignIn) \
4527 { \
4528 *piDst = a_iTypeMin; \
4529 fFsw |= X86_FSW_PE | X86_FSW_C1; \
4530 if (!(fFcw & X86_FCW_PM)) \
4531 fFsw |= X86_FSW_ES | X86_FSW_B; \
4532 } \
4533 else \
4534 { \
4535 fFsw |= X86_FSW_IE; \
4536 if (fFcw & X86_FCW_IM) \
4537 *piDst = a_iTypeMin; \
4538 else \
4539 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4540 } \
4541 } \
4542 } \
4543 /* \
4544 * Tiny sub-zero numbers. \
4545 */ \
4546 else if (iExponent < 0) \
4547 { \
4548 if (!fSignIn) \
4549 { \
4550 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4551 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4552 { \
4553 *piDst = 1; \
4554 fFsw |= X86_FSW_C1; \
4555 } \
4556 else \
4557 *piDst = 0; \
4558 } \
4559 else \
4560 { \
4561 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
4562 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
4563 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
4564 *piDst = 0; \
4565 else \
4566 { \
4567 *piDst = -1; \
4568 fFsw |= X86_FSW_C1; \
4569 } \
4570 } \
4571 fFsw |= X86_FSW_PE; \
4572 if (!(fFcw & X86_FCW_PM)) \
4573 fFsw |= X86_FSW_ES | X86_FSW_B; \
4574 } \
4575 /* \
4576 * Special MIN case. \
4577 */ \
4578 else if ( fSignIn && iExponent == a_cBits - 1 \
4579 && ( a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
4580 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4581 : uMantissa == RT_BIT_64(63))) \
4582 { \
4583 *piDst = a_iTypeMin; \
4584 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4585 { \
4586 fFsw |= X86_FSW_PE; \
4587 if (!(fFcw & X86_FCW_PM)) \
4588 fFsw |= X86_FSW_ES | X86_FSW_B; \
4589 } \
4590 } \
4591 /* \
4592 * Too large/small number outside the target integer range. \
4593 */ \
4594 else \
4595 { \
4596 fFsw |= X86_FSW_IE; \
4597 if (fFcw & X86_FCW_IM) \
4598 *piDst = a_iTypeIndefinite; \
4599 else \
4600 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4601 } \
4602 } \
4603 /* \
4604 * Map both +0 and -0 to integer zero (signless/+). \
4605 */ \
4606 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4607 *piDst = 0; \
4608 /* \
4609 * Denormals are just really tiny sub-zero numbers that are either rounded \
4610 * to zero, 1 or -1 depending on sign and rounding control. \
4611 */ \
4612 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4613 { \
4614 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
4615 *piDst = 0; \
4616 else \
4617 { \
4618 *piDst = fSignIn ? -1 : 1; \
4619 fFsw |= X86_FSW_C1; \
4620 } \
4621 fFsw |= X86_FSW_PE; \
4622 if (!(fFcw & X86_FCW_PM)) \
4623 fFsw |= X86_FSW_ES | X86_FSW_B; \
4624 } \
4625 /* \
4626 * All other special values are considered invalid arguments and result \
4627 * in an IE exception and indefinite value if masked. \
4628 */ \
4629 else \
4630 { \
4631 fFsw |= X86_FSW_IE; \
4632 if (fFcw & X86_FCW_IM) \
4633 *piDst = a_iTypeIndefinite; \
4634 else \
4635 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4636 } \
4637 *pu16FSW = fFsw; \
4638}
4639EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
4640EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
4641EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
4646/*
4647 * The FISTT instruction was added with SSE3 and are a lot simpler than FIST.
4648 *
4649 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
4650 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
4651 * thus the @a a_cBitsIn.
4652 */
4653#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
4654IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
4655 a_iType *piDst, PCRTFLOAT80U pr80Val)) \
4656{ \
4657 uint16_t const fFcw = pFpuState->FCW; \
4658 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
4659 bool const fSignIn = pr80Val->s.fSign; \
4660 \
4661 /* \
4662 * Deal with normal numbers first. \
4663 */ \
4664 if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
4665 { \
4666 uint64_t uMantissa = pr80Val->s.uMantissa; \
4667 int32_t iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
4668 \
4669 if ((uint32_t)iExponent <= a_cBitsIn - 2) \
4670 { \
4671 unsigned const cShiftOff = 63 - iExponent; \
4672 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
4673 uint64_t const fRoundedOff = uMantissa & fRoundingOffMask; \
4674 uMantissa >>= cShiftOff; \
4675 /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
4676 if (!fSignIn) \
4677 *piDst = (a_iType)uMantissa; \
4678 else \
4679 *piDst = -(a_iType)uMantissa; \
4680 \
4681 if (fRoundedOff) \
4682 { \
4683 fFsw |= X86_FSW_PE; \
4684 if (!(fFcw & X86_FCW_PM)) \
4685 fFsw |= X86_FSW_ES | X86_FSW_B; \
4686 } \
4687 } \
4688 /* \
4689 * Tiny sub-zero numbers. \
4690 */ \
4691 else if (iExponent < 0) \
4692 { \
4693 *piDst = 0; \
4694 fFsw |= X86_FSW_PE; \
4695 if (!(fFcw & X86_FCW_PM)) \
4696 fFsw |= X86_FSW_ES | X86_FSW_B; \
4697 } \
4698 /* \
4699 * Special MIN case. \
4700 */ \
4701 else if ( fSignIn && iExponent == a_cBits - 1 \
4702 && (a_cBits < 64 \
4703 ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
4704 : uMantissa == RT_BIT_64(63)) ) \
4705 { \
4706 *piDst = a_iTypeMin; \
4707 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4708 { \
4709 fFsw |= X86_FSW_PE; \
4710 if (!(fFcw & X86_FCW_PM)) \
4711 fFsw |= X86_FSW_ES | X86_FSW_B; \
4712 } \
4713 } \
4714 /* \
4715 * Figure this weirdness. \
4716 */ \
4717 else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
4718 { \
4719 *piDst = 0; \
4720 if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
4721 { \
4722 fFsw |= X86_FSW_PE; \
4723 if (!(fFcw & X86_FCW_PM)) \
4724 fFsw |= X86_FSW_ES | X86_FSW_B; \
4725 } \
4726 } \
4727 /* \
4728 * Too large/small number outside the target integer range. \
4729 */ \
4730 else \
4731 { \
4732 fFsw |= X86_FSW_IE; \
4733 if (fFcw & X86_FCW_IM) \
4734 *piDst = a_iTypeIndefinite; \
4735 else \
4736 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4737 } \
4738 } \
4739 /* \
4740 * Map both +0 and -0 to integer zero (signless/+). \
4741 */ \
4742 else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
4743 *piDst = 0; \
4744 /* \
4745 * Denormals are just really tiny sub-zero numbers that are trucated to zero. \
4746 */ \
4747 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
4748 { \
4749 *piDst = 0; \
4750 fFsw |= X86_FSW_PE; \
4751 if (!(fFcw & X86_FCW_PM)) \
4752 fFsw |= X86_FSW_ES | X86_FSW_B; \
4753 } \
4754 /* \
4755 * All other special values are considered invalid arguments and result \
4756 * in an IE exception and indefinite value if masked. \
4757 */ \
4758 else \
4759 { \
4760 fFsw |= X86_FSW_IE; \
4761 if (fFcw & X86_FCW_IM) \
4762 *piDst = a_iTypeIndefinite; \
4763 else \
4764 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
4765 } \
4766 *pu16FSW = fFsw; \
4767}
4768#if defined(IEM_WITHOUT_ASSEMBLY)
4769EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
4770EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
4771EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
4772#endif
4773EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
4774EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
4779IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4780 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
4781{
4782 /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
4783 static RTPBCD80U const s_ad80Zeros[2] = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
4784 static RTPBCD80U const s_ad80One[2] = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
4785 RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
4786 static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();
4787
4788 uint16_t const fFcw = pFpuState->FCW;
4789 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4790 bool const fSignIn = pr80Src->s.fSign;
4791
4792 /*
4793 * Deal with normal numbers first.
4794 */
4795 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4796 {
4797 uint64_t uMantissa = pr80Src->s.uMantissa;
4798 int32_t iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
4799 if ( (uint32_t)iExponent <= 58
4800 || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
4801 {
4802 unsigned const cShiftOff = 63 - iExponent;
4803 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
4804 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
4805 ? RT_BIT_64(cShiftOff - 1)
4806 : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
4807 ? fRoundingOffMask
4808 : 0;
4809 uint64_t fRoundedOff = uMantissa & fRoundingOffMask;
4810
4811 uMantissa >>= cShiftOff;
4812 uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
4813 uMantissa += uRounding;
4814 if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
4815 {
4816 if (fRoundedOff)
4817 {
4818 if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
4819 uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
4820 else if (uRounding)
4821 fFsw |= X86_FSW_C1;
4822 fFsw |= X86_FSW_PE;
4823 if (!(fFcw & X86_FCW_PM))
4824 fFsw |= X86_FSW_ES | X86_FSW_B;
4825 }
4826
4827 pd80Dst->s.fSign = fSignIn;
4828 pd80Dst->s.uPad = 0;
4829 for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
4830 {
4831 unsigned const uDigits = uMantissa % 100;
4832 uMantissa /= 100;
4833 uint8_t const bLo = uDigits % 10;
4834 uint8_t const bHi = uDigits / 10;
4835 pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
4836 }
4837 }
4838 else
4839 {
4840 /* overflowed after rounding. */
4841 fFsw |= X86_FSW_IE;
4842 if (fFcw & X86_FCW_IM)
4843 *pd80Dst = s_d80Indefinite;
4844 else
4845 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4846 }
4847 }
4848 /*
4849 * Tiny sub-zero numbers.
4850 */
4851 else if (iExponent < 0)
4852 {
4853 if (!fSignIn)
4854 {
4855 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4856 || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4857 {
4858 *pd80Dst = s_ad80One[fSignIn];
4859 fFsw |= X86_FSW_C1;
4860 }
4861 else
4862 *pd80Dst = s_ad80Zeros[fSignIn];
4863 }
4864 else
4865 {
4866 if ( (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
4867 || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
4868 || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
4869 *pd80Dst = s_ad80Zeros[fSignIn];
4870 else
4871 {
4872 *pd80Dst = s_ad80One[fSignIn];
4873 fFsw |= X86_FSW_C1;
4874 }
4875 }
4876 fFsw |= X86_FSW_PE;
4877 if (!(fFcw & X86_FCW_PM))
4878 fFsw |= X86_FSW_ES | X86_FSW_B;
4879 }
4880 /*
4881 * Too large/small number outside the target integer range.
4882 */
4883 else
4884 {
4885 fFsw |= X86_FSW_IE;
4886 if (fFcw & X86_FCW_IM)
4887 *pd80Dst = s_d80Indefinite;
4888 else
4889 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4890 }
4891 }
4892 /*
4893 * Map both +0 and -0 to integer zero (signless/+).
4894 */
4895 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4896 *pd80Dst = s_ad80Zeros[fSignIn];
4897 /*
4898 * Denormals are just really tiny sub-zero numbers that are either rounded
4899 * to zero, 1 or -1 depending on sign and rounding control.
4900 */
4901 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
4902 {
4903 if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
4904 *pd80Dst = s_ad80Zeros[fSignIn];
4905 else
4906 {
4907 *pd80Dst = s_ad80One[fSignIn];
4908 fFsw |= X86_FSW_C1;
4909 }
4910 fFsw |= X86_FSW_PE;
4911 if (!(fFcw & X86_FCW_PM))
4912 fFsw |= X86_FSW_ES | X86_FSW_B;
4913 }
4914 /*
4915 * All other special values are considered invalid arguments and result
4916 * in an IE exception and indefinite value if masked.
4917 */
4918 else
4919 {
4920 fFsw |= X86_FSW_IE;
4921 if (fFcw & X86_FCW_IM)
4922 *pd80Dst = s_d80Indefinite;
4923 else
4924 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
4925 }
4926 *pu16FSW = fFsw;
4927}
4928
4929
4930/*********************************************************************************************************************************
4931* FPU Helpers *
4932*********************************************************************************************************************************/
4933AssertCompileSize(RTFLOAT128U, 16);
4934AssertCompileSize(RTFLOAT80U, 10);
4935AssertCompileSize(RTFLOAT64U, 8);
4936AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro declares a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and, if a normalization was performed, updates the
 * @a a_pr80Val variable to point to it.  It must be followed by a semicolon.
 *
 * @note This must be applied before calling SoftFloat with a value that could
 *       be a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (!RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { /* nothing to normalize */ } \
    else \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure, derived from an FPU control
 * word.
 *
 * The five values are, in order: tininess detection (after rounding), the
 * rounding mode matching FCW.RC, a zero, the exception mask bits of FCW, and a
 * rounding precision of 32, 64 or 80 matching FCW.PC.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
          ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN    ? (uint8_t)softfloat_round_min \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP      ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        :                                                      (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
          ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        :                                                 (uint8_t)80 \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The exception flags recorded by SoftFloat are merged into @a a_fFsw, the C1
 * indicator is moved up two bits into place, and ES + B are added when any
 * recorded exception is not masked by @a a_fFcw.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (   (a_fFsw) \
      | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
      | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
      | (   (((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK)) != 0 \
         ? X86_FSW_ES | X86_FSW_B \
         : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts from the IPRT 128-bit floating point format to the SoftFloat one.
 *
 * This only copies the two 64-bit words, nothing else.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret;
    r128Ret.v[0] = pr128->au64[0];
    r128Ret.v[1] = pr128->au64[1];
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
/**
 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
 * mantissa, exponent and sign.
 *
 * The steps are: bias the exponent, shift the mantissa left until bit 63 of
 * qw2 is set (as far as the exponent allows), round qw2 according to the
 * rounding control in @a fFcw using qw1 and qw0 as the discarded bits, and
 * finally handle underflow before storing the three fields.
 *
 * @returns Updated FSW.
 * @param   pr80Dst         Where to return the composed value.
 * @param   fSign           The sign.
 * @param   puMantissa      The mantissa, 256-bit type but the top 64 bits are
 *                          ignored and should be zero.  This will probably be
 *                          modified during normalization and rounding.
 * @param   iExponent       Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            The FPU status word.
 */
static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
                                                    int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
{
    /* The top quad word is not part of the 192-bit mantissa; force it to zero if the assertion fails. */
    AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);

    iExponent += RTFLOAT80U_EXP_BIAS;

    /* Do normalization if necessary and possible. */
    if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
    {
        int cShift = 192 - RTUInt256BitCount(puMantissa);
        if (iExponent > cShift)
            iExponent -= cShift;
        else
        {
            /* The exponent is too small for a full normalization shift.  With
               underflow masked the shift is limited so the exponent ends at
               zero (or no shift at all if it already is non-positive);
               otherwise the full shift is applied and the exponent is left
               non-positive for the underflow handling further down. */
            if (fFcw & X86_FCW_UM)
            {
                if (iExponent > 0)
                    cShift = --iExponent;
                else
                    cShift = 0;
            }
            iExponent -= cShift;
        }
        RTUInt256AssignShiftLeft(puMantissa, cShift);
    }

    /* Do rounding.  Only needed when any of the bits below qw2 are set. */
    uint64_t uMantissa = puMantissa->QWords.qw2;
    if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
    {
        bool fAdd;
        switch (fFcw & X86_FCW_RC_MASK)
        {
            default: /* (for the simple-minded MSC which otherwise thinks fAdd would be used uninitialized) */
            case X86_FCW_RC_NEAREST:
                /* Round up when the discarded bits are more than half, or
                   exactly half and the kept mantissa is odd. */
                if (puMantissa->QWords.qw1 & RT_BIT_64(63))
                {
                    if (   (uMantissa & 1)
                        || puMantissa->QWords.qw0 != 0
                        || puMantissa->QWords.qw1 != RT_BIT_64(63))
                    {
                        fAdd = true;
                        break;
                    }
                    uMantissa &= ~(uint64_t)1;
                }
                fAdd = false;
                break;
            case X86_FCW_RC_ZERO:
                fAdd = false;
                break;
            case X86_FCW_RC_UP:
                fAdd = !fSign;
                break;
            case X86_FCW_RC_DOWN:
                fAdd = fSign;
                break;
        }
        if (fAdd)
        {
            uint64_t const uTmp = uMantissa;
            uMantissa = uTmp + 1;
            if (uMantissa < uTmp)
            {
                /* The increment wrapped around: restore the top bit and bump the exponent. */
                uMantissa >>= 1;
                uMantissa  |= RT_BIT_64(63);
                iExponent++;
            }
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    /* Check for underflow (denormals). */
    if (iExponent <= 0)
    {
        if (fFcw & X86_FCW_UM)
        {
            /* Masked: store with exponent zero, moving a set top bit down one position. */
            if (uMantissa & RT_BIT_64(63))
                uMantissa >>= 1;
            iExponent = 0;
        }
        else
        {
            /* Unmasked: apply the bias adjustment to the exponent and signal the exception. */
            iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
            fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        fFsw |= X86_FSW_UE;
    }
    /* Check for overflow */
    else if (iExponent >= RTFLOAT80U_EXP_MAX)
    {
        /* NOTE(review): overflow is not handled here; this branch only asserts
           and the out-of-range exponent is stored below as-is.  Presumably
           callers never produce such exponents - confirm. */
        Assert(iExponent < RTFLOAT80U_EXP_MAX);
    }

    /* Compose the result. */
    pr80Dst->s.uMantissa = uMantissa;
    pr80Dst->s.uExponent = iExponent;
    pr80Dst->s.fSign     = fSign;
    return fFsw;
}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The emitted function converts the 64-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, afterwards forcing TOP in the resulting FSW to 7.  When the
 * conversion reports a denormal operand:
 *   - it is ignored if the first operand is a 387-invalid value or a NaN, or
 *     if @a a_DenormalException evaluates to true;
 *   - otherwise, if \#DE is unmasked, the first operand is returned as result
 *     with DE, ES and B set and the 80-bit worker is not called;
 *   - otherwise DE is OR'ed into the FSW produced by the worker.
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        bool const fIgnoreDenormal = RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                  || RTFLOAT80U_IS_NAN(pr80Val1) \
                                  || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: leave the first operand as the result and skip the operation. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The emitted function converts the 32-bit operand to 80-bit and forwards to
 * @a a_fnR80ByR80, afterwards forcing TOP in the resulting FSW to 7.  When the
 * conversion reports a denormal operand:
 *   - it is ignored if the first operand is a 387-invalid value or a NaN, or
 *     if @a a_DenormalException evaluates to true;
 *   - otherwise, if \#DE is unmasked, the first operand is returned as result
 *     with DE, ES and B set and the 80-bit worker is not called;
 *   - otherwise DE is OR'ed into the FSW produced by the worker.
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw) \
    { \
        bool const fIgnoreDenormal = RTFLOAT80U_IS_387_INVALID(pr80Val1) \
                                  || RTFLOAT80U_IS_NAN(pr80Val1) \
                                  || (a_DenormalException); \
        if (fIgnoreDenormal) \
            fFsw = 0; \
        else if (!(pFpuState->FCW & X86_FCW_DM)) \
        { \
            /* Unmasked #DE: leave the first operand as the result and skip the operation. */ \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            pFpuRes->r80Result = *pr80Val1; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The emitted function converts the 32-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and then forces TOP in the resulting FSW
 * to 7.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The emitted function converts the 16-bit integer operand to 80-bit floating
 * point, forwards to @a a_fnR80ByR80 and then forces TOP in the resulting FSW
 * to 7.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
5638/*********************************************************************************************************************************
*   x87 FPU Division Operations                                                                                                  *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
6114EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6115EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6116EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6117EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigonometric Operations *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
6170IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6171 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6172{
6173 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6174}
6175
6176IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6177 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6178{
6179 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6180}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6203 {
6204 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6205 {
6206 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6207 pFpuResTwo->r80Result1 = *pr80Val;
6208 }
6209 else
6210 {
6211 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6212 {
6213 pFpuResTwo->r80Result1 = *pr80Val;
6214 }
6215 else
6216 {
6217 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6218 }
6219
6220 pFpuResTwo->r80Result2 = g_ar80One[0];
6221
6222 fFsw |= X86_FSW_PE;
6223 if (!(fFcw & X86_FCW_PM))
6224 fFsw |= X86_FSW_ES | X86_FSW_B;
6225 }
6226 }
6227 else
6228 {
6229 fFsw |= X86_FSW_IE;
6230 if (!(fFcw & X86_FCW_IM))
6231 fFsw |= X86_FSW_ES | X86_FSW_B;
6232 }
6233
6234 pFpuResTwo->FSW = fFsw;
6235}
6236#endif /* IEM_WITHOUT_ASSEMBLY */
6237
6238IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6239{
6240 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6241}
6242
6243IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6244{
6245 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6246}
6247
6248#ifdef IEM_WITHOUT_ASSEMBLY
6249
6250static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6251{
6252 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6253 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6254 extFloat80_t v;
6255 (void)fFcw;
6256
6257 v = extF80_sin(x, &SoftState);
6258
6259 iemFpuSoftF80ToIprt(pr80Result, v);
6260
6261 return fFsw;
6262}
6263
6264IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6265{
6266 uint16_t const fFcw = pFpuState->FCW;
6267 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6268
6269 if (RTFLOAT80U_IS_ZERO(pr80Val))
6270 {
6271 pFpuRes->r80Result = *pr80Val;
6272 }
6273 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6274 {
6275 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6276 {
6277 fFsw |= X86_FSW_C2;
6278 pFpuRes->r80Result = *pr80Val;
6279 }
6280 else
6281 {
6282 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6283 {
6284 pFpuRes->r80Result = *pr80Val;
6285
6286 }
6287 else
6288 {
6289 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6290 }
6291 fFsw |= X86_FSW_PE;
6292 if (!(fFcw & X86_FCW_PM))
6293 fFsw |= X86_FSW_ES | X86_FSW_B;
6294 }
6295 }
6296 else if (RTFLOAT80U_IS_INF(pr80Val))
6297 {
6298 fFsw |= X86_FSW_IE;
6299 if (!(fFcw & X86_FCW_IM))
6300 {
6301 fFsw |= X86_FSW_ES | X86_FSW_B;
6302 pFpuRes->r80Result = *pr80Val;
6303 }
6304 else
6305 {
6306 pFpuRes->r80Result = g_r80Indefinite;
6307 }
6308 }
6309 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6310 {
6311 pFpuRes->r80Result = *pr80Val;
6312 fFsw |= X86_FSW_DE;
6313
6314 if (fFcw & X86_FCW_DM)
6315 {
6316 fFsw |= X86_FSW_UE | X86_FSW_PE;
6317
6318 if (!(fFcw & X86_FCW_UM) || !(fFcw & X86_FCW_PM))
6319 {
6320 fFsw |= X86_FSW_ES | X86_FSW_B;
6321 }
6322 }
6323 else
6324 {
6325 fFsw |= X86_FSW_ES | X86_FSW_B;
6326 }
6327 }
6328 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6329 {
6330 pFpuRes->r80Result = *pr80Val;
6331 fFsw |= X86_FSW_DE;
6332
6333 if (fFcw & X86_FCW_DM)
6334 {
6335 if (fFcw & X86_FCW_PM)
6336 {
6337 fFsw |= X86_FSW_PE;
6338 }
6339 else
6340 {
6341 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6342 }
6343
6344 pFpuRes->r80Result.sj64.uExponent = 1;
6345 }
6346 else
6347 {
6348 fFsw |= X86_FSW_ES | X86_FSW_B;
6349 }
6350 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6351 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6352 {
6353 pFpuRes->r80Result = *pr80Val;
6354 } else {
6355 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6356 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6357 && (fFcw & X86_FCW_IM))
6358 pFpuRes->r80Result = g_r80Indefinite;
6359 else
6360 {
6361 pFpuRes->r80Result = *pr80Val;
6362 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6363 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6364 }
6365
6366 fFsw |= X86_FSW_IE;
6367 if (!(fFcw & X86_FCW_IM))
6368 fFsw |= X86_FSW_ES | X86_FSW_B;
6369 }
6370
6371 pFpuRes->FSW = fFsw;
6372}
6373#endif /* IEM_WITHOUT_ASSEMBLY */
6374
6375IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6376{
6377 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6378}
6379
6380IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6381{
6382 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6383}
6384
6385#ifdef IEM_WITHOUT_ASSEMBLY
6386
6387static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6388{
6389 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6390 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6391 extFloat80_t v;
6392 (void)fFcw;
6393
6394 v = extF80_cos(x, &SoftState);
6395
6396 iemFpuSoftF80ToIprt(pr80Result, v);
6397
6398 return fFsw;
6399}
6400
6401IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6402{
6403 uint16_t const fFcw = pFpuState->FCW;
6404 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6405
6406 if (RTFLOAT80U_IS_ZERO(pr80Val))
6407 {
6408 pFpuRes->r80Result = g_ar80One[0];
6409 }
6410 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6411 {
6412 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6413 {
6414 fFsw |= X86_FSW_C2;
6415 pFpuRes->r80Result = *pr80Val;
6416 }
6417 else
6418 {
6419 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6420 {
6421 pFpuRes->r80Result = g_ar80One[0];
6422
6423 }
6424 else
6425 {
6426 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6427 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6428 }
6429 fFsw |= X86_FSW_PE;
6430 if (!(fFcw & X86_FCW_PM))
6431 fFsw |= X86_FSW_ES | X86_FSW_B;
6432 }
6433 }
6434 else if (RTFLOAT80U_IS_INF(pr80Val))
6435 {
6436 fFsw |= X86_FSW_IE;
6437 if (!(fFcw & X86_FCW_IM))
6438 {
6439 fFsw |= X86_FSW_ES | X86_FSW_B;
6440 pFpuRes->r80Result = *pr80Val;
6441 }
6442 else
6443 {
6444 pFpuRes->r80Result = g_r80Indefinite;
6445 }
6446 }
6447 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6448 {
6449 fFsw |= X86_FSW_DE;
6450
6451 if (fFcw & X86_FCW_DM)
6452 {
6453 pFpuRes->r80Result = g_ar80One[0];
6454
6455 if (fFcw & X86_FCW_PM)
6456 {
6457 fFsw |= X86_FSW_PE;
6458 }
6459 else
6460 {
6461 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6462 }
6463 }
6464 else
6465 {
6466 pFpuRes->r80Result = *pr80Val;
6467 fFsw |= X86_FSW_ES | X86_FSW_B;
6468 }
6469 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6470 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6471 {
6472 pFpuRes->r80Result = *pr80Val;
6473 } else {
6474 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6475 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6476 && (fFcw & X86_FCW_IM))
6477 pFpuRes->r80Result = g_r80Indefinite;
6478 else
6479 {
6480 pFpuRes->r80Result = *pr80Val;
6481 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6482 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6483 }
6484
6485 fFsw |= X86_FSW_IE;
6486 if (!(fFcw & X86_FCW_IM))
6487 fFsw |= X86_FSW_ES | X86_FSW_B;
6488 }
6489
6490 pFpuRes->FSW = fFsw;
6491}
6492#endif /* IEM_WITHOUT_ASSEMBLY */
6493
6494IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6495{
6496 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6497}
6498
6499IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6500{
6501 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6502}
6503
6504#ifdef IEM_WITHOUT_ASSEMBLY
6505
6506static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6507{
6508 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6509 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6510 extFloat80_t r80Sin, r80Cos;
6511 (void)fFcw;
6512
6513 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6514
6515 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6516 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6517
6518 return fFsw;
6519}
6520
6521IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6522{
6523 uint16_t const fFcw = pFpuState->FCW;
6524 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6525
6526 if (RTFLOAT80U_IS_ZERO(pr80Val))
6527 {
6528 pFpuResTwo->r80Result1 = *pr80Val;
6529 pFpuResTwo->r80Result2 = g_ar80One[0];
6530 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6531 }
6532 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6533 {
6534 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6535 {
6536 fFsw |= X86_FSW_C2;
6537
6538 if (fFcw & X86_FCW_IM)
6539 {
6540 pFpuResTwo->r80Result1 = g_r80Indefinite;
6541 }
6542 else
6543 {
6544 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6545 }
6546
6547 pFpuResTwo->r80Result2 = *pr80Val;
6548 }
6549 else
6550 {
6551 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6552
6553 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6554 {
6555 pFpuResTwo->r80Result1 = *pr80Val;
6556 pFpuResTwo->r80Result2 = g_ar80One[0];
6557 }
6558 else
6559 {
6560 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6561 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6562 }
6563 fFsw |= X86_FSW_PE;
6564 if (!(fFcw & X86_FCW_PM))
6565 fFsw |= X86_FSW_ES | X86_FSW_B;
6566 }
6567 }
6568 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6569 {
6570 fFsw |= X86_FSW_DE;
6571
6572 if (fFcw & X86_FCW_DM)
6573 {
6574 pFpuResTwo->r80Result1 = *pr80Val;
6575 pFpuResTwo->r80Result2 = g_ar80One[0];
6576 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6577
6578 if (fFcw & X86_FCW_PM)
6579 {
6580 fFsw |= X86_FSW_PE;
6581 }
6582 else
6583 {
6584 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6585 }
6586
6587 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6588 }
6589 else
6590 {
6591 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6592 pFpuResTwo->r80Result2 = *pr80Val;
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6597 {
6598 fFsw |= X86_FSW_DE;
6599
6600 if (fFcw & X86_FCW_DM)
6601 {
6602 pFpuResTwo->r80Result1 = *pr80Val;
6603 pFpuResTwo->r80Result2 = g_ar80One[0];
6604
6605 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6606 fFsw |= X86_FSW_UE | X86_FSW_PE;
6607
6608 if (fFcw & X86_FCW_PM)
6609 {
6610 if (!(fFcw & X86_FCW_UM))
6611 fFsw |= X86_FSW_ES | X86_FSW_B;
6612 }
6613 else
6614 {
6615 fFsw |= X86_FSW_ES | X86_FSW_B;
6616 }
6617 }
6618 else
6619 {
6620 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6621 pFpuResTwo->r80Result2 = *pr80Val;
6622 fFsw |= X86_FSW_ES | X86_FSW_B;
6623 }
6624 }
6625 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6626 {
6627 pFpuResTwo->r80Result1 = *pr80Val;
6628 pFpuResTwo->r80Result2 = *pr80Val;
6629 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6630 }
6631 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6632 {
6633 if (fFcw & X86_FCW_IM)
6634 {
6635 pFpuResTwo->r80Result1 = g_r80Indefinite;
6636 pFpuResTwo->r80Result2 = g_r80Indefinite;
6637 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6638 }
6639 else
6640 {
6641 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6642 pFpuResTwo->r80Result2 = *pr80Val;
6643 }
6644
6645 fFsw |= X86_FSW_IE;
6646 if (!(fFcw & X86_FCW_IM))
6647 fFsw |= X86_FSW_ES | X86_FSW_B;
6648 }
6649 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6650 {
6651 pFpuResTwo->r80Result1 = *pr80Val;
6652 pFpuResTwo->r80Result2 = *pr80Val;
6653
6654 if (fFcw & X86_FCW_IM)
6655 {
6656 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6657 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6658 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6659 }
6660 else
6661 {
6662 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6663 pFpuResTwo->r80Result2 = *pr80Val;
6664 }
6665
6666 fFsw |= X86_FSW_IE;
6667 if (!(fFcw & X86_FCW_IM))
6668 fFsw |= X86_FSW_ES | X86_FSW_B;
6669 }
6670 else if (RTFLOAT80U_IS_INF(pr80Val))
6671 {
6672 if (fFcw & X86_FCW_IM)
6673 {
6674 pFpuResTwo->r80Result1 = g_r80Indefinite;
6675 pFpuResTwo->r80Result2 = g_r80Indefinite;
6676 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6677 }
6678 else
6679 {
6680 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6681 pFpuResTwo->r80Result2 = *pr80Val;
6682 }
6683
6684 fFsw |= X86_FSW_IE;
6685 if (!(fFcw & X86_FCW_IM))
6686 fFsw |= X86_FSW_ES | X86_FSW_B;
6687 }
6688
6689 pFpuResTwo->FSW = fFsw;
6690}
6691#endif /* IEM_WITHOUT_ASSEMBLY */
6692
6693IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6694{
6695 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6696}
6697
6698IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6699{
6700 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6701}
6702
6703#ifdef IEM_WITHOUT_ASSEMBLY
6704
6705
6706/*********************************************************************************************************************************
6707* x87 FPU Compare and Testing Operations *
6708*********************************************************************************************************************************/
6709
6710IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6711{
6712 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6713
6714 if (RTFLOAT80U_IS_ZERO(pr80Val))
6715 fFsw |= X86_FSW_C3;
6716 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6717 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6718 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6719 {
6720 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6721 if (!(pFpuState->FCW & X86_FCW_DM))
6722 fFsw |= X86_FSW_ES | X86_FSW_B;
6723 }
6724 else
6725 {
6726 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6727 if (!(pFpuState->FCW & X86_FCW_IM))
6728 fFsw |= X86_FSW_ES | X86_FSW_B;
6729 }
6730
6731 *pu16Fsw = fFsw;
6732}
6733
6734
6735IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6736{
6737 RT_NOREF(pFpuState);
6738 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6739
6740 /* C1 = sign bit (always, even if empty Intel says). */
6741 if (pr80Val->s.fSign)
6742 fFsw |= X86_FSW_C1;
6743
6744 /* Classify the value in C0, C2, C3. */
6745 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6746 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6747 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6748 fFsw |= X86_FSW_C2;
6749 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6750 fFsw |= X86_FSW_C3;
6751 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6752 fFsw |= X86_FSW_C0;
6753 else if (RTFLOAT80U_IS_INF(pr80Val))
6754 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6755 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6756 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6757 /* whatever else: 0 */
6758
6759 *pu16Fsw = fFsw;
6760}
6761
6762
6763/**
6764 * Worker for fcom, fucom, and friends.
6765 */
6766static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6767 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6768{
6769 /*
6770 * Unpack the values.
6771 */
6772 bool const fSign1 = pr80Val1->s.fSign;
6773 int32_t iExponent1 = pr80Val1->s.uExponent;
6774 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6775
6776 bool const fSign2 = pr80Val2->s.fSign;
6777 int32_t iExponent2 = pr80Val2->s.uExponent;
6778 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6779
6780 /*
6781 * Check for invalid inputs.
6782 */
6783 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6784 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6785 {
6786 if (!(fFcw & X86_FCW_IM))
6787 fFsw |= X86_FSW_ES | X86_FSW_B;
6788 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6789 }
6790
6791 /*
6792 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6793 */
6794 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6795 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6796 {
6797 if ( fIeOnAllNaNs
6798 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6799 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6800 {
6801 fFsw |= X86_FSW_IE;
6802 if (!(fFcw & X86_FCW_IM))
6803 fFsw |= X86_FSW_ES | X86_FSW_B;
6804 }
6805 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6806 }
6807
6808 /*
6809 * Normalize the values.
6810 */
6811 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6812 {
6813 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6814 iExponent1 = 1;
6815 else
6816 {
6817 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6818 uMantissa1 <<= iExponent1;
6819 iExponent1 = 1 - iExponent1;
6820 }
6821 fFsw |= X86_FSW_DE;
6822 if (!(fFcw & X86_FCW_DM))
6823 fFsw |= X86_FSW_ES | X86_FSW_B;
6824 }
6825
6826 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6827 {
6828 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6829 iExponent2 = 1;
6830 else
6831 {
6832 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6833 uMantissa2 <<= iExponent2;
6834 iExponent2 = 1 - iExponent2;
6835 }
6836 fFsw |= X86_FSW_DE;
6837 if (!(fFcw & X86_FCW_DM))
6838 fFsw |= X86_FSW_ES | X86_FSW_B;
6839 }
6840
6841 /*
6842 * Test if equal (val1 == val2):
6843 */
6844 if ( uMantissa1 == uMantissa2
6845 && iExponent1 == iExponent2
6846 && ( fSign1 == fSign2
6847 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6848 fFsw |= X86_FSW_C3;
6849 /*
6850 * Test if less than (val1 < val2):
6851 */
6852 else if (fSign1 && !fSign2)
6853 fFsw |= X86_FSW_C0;
6854 else if (fSign1 == fSign2)
6855 {
6856 /* Zeros are problematic, however at the most one can be zero here. */
6857 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6858 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6859 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6860 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6861
6862 if ( fSign1
6863 ^ ( iExponent1 < iExponent2
6864 || ( iExponent1 == iExponent2
6865 && uMantissa1 < uMantissa2 ) ) )
6866 fFsw |= X86_FSW_C0;
6867 }
6868 /* else: No flags set if greater. */
6869
6870 return fFsw;
6871}
6872
6873
6874IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6875 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6876{
6877 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6878}
6879
6880
6881
6882
6883IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6884 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6885{
6886 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6887}
6888
6889
6890IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6891 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6892{
6893 RTFLOAT80U r80Val2;
6894 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6895 Assert(!fFsw || fFsw == X86_FSW_DE);
6896 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6897 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6898 {
6899 if (!(pFpuState->FCW & X86_FCW_DM))
6900 fFsw |= X86_FSW_ES | X86_FSW_B;
6901 *pfFsw |= fFsw;
6902 }
6903}
6904
6905
6906IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6907 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6908{
6909 RTFLOAT80U r80Val2;
6910 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6911 Assert(!fFsw || fFsw == X86_FSW_DE);
6912 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6913 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6914 {
6915 if (!(pFpuState->FCW & X86_FCW_DM))
6916 fFsw |= X86_FSW_ES | X86_FSW_B;
6917 *pfFsw |= fFsw;
6918 }
6919}
6920
6921
6922IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6923 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6924{
6925 RTFLOAT80U r80Val2;
6926 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6927 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6928}
6929
6930
6931IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6932 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6933{
6934 RTFLOAT80U r80Val2;
6935 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6936 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6937}
6938
6939
6940/**
6941 * Worker for fcomi & fucomi.
6942 */
6943static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6944 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6945{
6946 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6947 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6948 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6949 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6950
6951 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6952 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6953 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6954}
6955
6956
6957IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6958 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6959{
6960 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
6961}
6962
6963
6964IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6965 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6966{
6967 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
6968}
6969
6970
6971/*********************************************************************************************************************************
6972* x87 FPU Other Operations *
6973*********************************************************************************************************************************/
6974
6975/**
6976 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
6977 */
6978static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6979{
6980 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6981 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
6982 true /*exact / generate #PE */, &SoftState));
6983 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
6984}
6985
6986
6987IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6988{
6989 uint16_t const fFcw = pFpuState->FCW;
6990 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6991
6992 if (RTFLOAT80U_IS_NORMAL(pr80Val))
6993 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6994 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
6995 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6996 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
6997 || RTFLOAT80U_IS_INF(pr80Val))
6998 pFpuRes->r80Result = *pr80Val;
6999 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7000 {
7001 fFsw |= X86_FSW_DE;
7002 if (fFcw & X86_FCW_DM)
7003 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7004 else
7005 {
7006 pFpuRes->r80Result = *pr80Val;
7007 fFsw |= X86_FSW_ES | X86_FSW_B;
7008 }
7009 }
7010 else
7011 {
7012 if (fFcw & X86_FCW_IM)
7013 {
7014 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7015 pFpuRes->r80Result = g_r80Indefinite;
7016 else
7017 {
7018 pFpuRes->r80Result = *pr80Val;
7019 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7020 }
7021 }
7022 else
7023 {
7024 pFpuRes->r80Result = *pr80Val;
7025 fFsw |= X86_FSW_ES | X86_FSW_B;
7026 }
7027 fFsw |= X86_FSW_IE;
7028 }
7029 pFpuRes->FSW = fFsw;
7030}
7031
7032
7033IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7034 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7035{
7036 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7037 it does everything we need it to do. */
7038 uint16_t const fFcw = pFpuState->FCW;
7039 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7040 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7041 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7042 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7043}
7044
7045
7046/**
7047 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7048 */
7049static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7050{
7051 Assert(!pr80Val->s.fSign);
7052 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7053 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7054 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7055}
7056
7057
7058IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7059{
7060 uint16_t const fFcw = pFpuState->FCW;
7061 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7062
7063 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7064 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7065 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7066 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7067 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7068 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7069 pFpuRes->r80Result = *pr80Val;
7070 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7071 {
7072 fFsw |= X86_FSW_DE;
7073 if (fFcw & X86_FCW_DM)
7074 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7075 else
7076 {
7077 pFpuRes->r80Result = *pr80Val;
7078 fFsw |= X86_FSW_ES | X86_FSW_B;
7079 }
7080 }
7081 else
7082 {
7083 if (fFcw & X86_FCW_IM)
7084 {
7085 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7086 pFpuRes->r80Result = g_r80Indefinite;
7087 else
7088 {
7089 pFpuRes->r80Result = *pr80Val;
7090 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7091 }
7092 }
7093 else
7094 {
7095 pFpuRes->r80Result = *pr80Val;
7096 fFsw |= X86_FSW_ES | X86_FSW_B;
7097 }
7098 fFsw |= X86_FSW_IE;
7099 }
7100 pFpuRes->FSW = fFsw;
7101}
7102
7103
7104/**
7105 * @code{.unparsed}
7106 * x x * ln2
7107 * f(x) = 2 - 1 = e - 1
7108 *
7109 * @endcode
7110 *
7111 * We can approximate e^x by a Taylor/Maclaurin series (see
7112 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7113 * @code{.unparsed}
7114 * n 0 1 2 3 4
7115 * inf x x x x x x
7116 * SUM ----- = --- + --- + --- + --- + --- + ...
7117 * n=0 n! 0! 1! 2! 3! 4!
7118 *
7119 * 2 3 4
7120 * x x x
7121 * = 1 + x + --- + --- + --- + ...
7122 * 2! 3! 4!
7123 * @endcode
7124 *
7125 * Given z = x * ln2, we get:
7126 * @code{.unparsed}
7127 * 2 3 4 n
7128 * z z z z z
7129 * e - 1 = z + --- + --- + --- + ... + ---
7130 * 2! 3! 4! n!
7131 * @endcode
7132 *
7133 * Wanting to use Horner's method, we move one z outside and get:
7134 * @code{.unparsed}
7135 * 2 3 (n-1)
7136 * z z z z
7137 * = z ( 1 + --- + --- + --- + ... + ------- )
7138 * 2! 3! 4! n!
7139 * @endcode
7140 *
7141 * The constants we need for using Horner's methods are 1 and 1 / n!.
7142 *
 * For very tiny x values, we can get away with f(x) = x * ln 2, because we
 * don't have the necessary precision to represent 1.0 + z/2 + ... and can
 * approximate it to be 1.0.  For a visual demonstration of this, check out
 * https://www.desmos.com/calculator/vidcdxizd9 (for as long as it is
 * valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7148 *
7149 *
7150 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7151 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7152 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7153 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7154 * blocks). (The one bit difference is probably an implicit one missing from
7155 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7156 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7157 * exponent.
7158 *
 * However, even when sticking to 67-bit constants / 68-bit mantissas, I have
 * not yet successfully reproduced the exact results from an Intel 10980XE;
 * there is always a portion of rounding differences.  Not going to spend too
 * much time on getting this 100% the same, at least not now.
7163 *
 * P.S. If someone is really curious about the 8087 and its constants:
 *          http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7166 *
7167 *
7168 * @param pr80Val The exponent value (x), less than 1.0, greater than
7169 * -1.0 and not zero. This can be a normal, denormal
7170 * or pseudo-denormal value.
7171 * @param pr80Result Where to return the result.
7172 * @param fFcw FPU control word.
7173 * @param fFsw FPU status word.
7174 */
7175static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7176{
7177 /* As mentioned above, we can skip the expensive polynomial calculation
7178 as it will be close enough to 1.0 that it makes no difference.
7179
7180 The cutoff point for intel 10980XE is exponents >= -69. Intel
7181 also seems to be using a 67-bit or 68-bit constant value, and we get
7182 a smattering of rounding differences if we go for higher precision. */
7183 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7184 {
7185 RTUINT256U u256;
7186 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7187 u256.QWords.qw0 |= 1; /* force #PE */
7188 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7189 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7190 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7191 : 1 - RTFLOAT80U_EXP_BIAS,
7192 fFcw, fFsw);
7193 }
7194 else
7195 {
7196#ifdef IEM_WITH_FLOAT128_FOR_FPU
7197 /* This approach is not good enough for small values, we end up with zero. */
7198 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7199 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7200 _Float128 rd128Result = powf128(2.0L, rd128Val);
7201 rd128Result -= 1.0L;
7202 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7203 iemFpuF128RestoreRounding(fOldRounding);
7204
7205# else
7206 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7207 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7208
7209 /* As mentioned above, enforce 68-bit internal mantissa width to better
7210 match the Intel 10980XE results. */
7211 unsigned const cPrecision = 68;
7212
7213 /* first calculate z = x * ln2 */
7214 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7215 cPrecision);
7216
7217 /* Then do the polynomial evaluation. */
7218 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7219 cPrecision, &SoftState);
7220 r = f128_mul(z, r, &SoftState);
7221
7222 /* Output the result. */
7223 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7224# endif
7225 }
7226 return fFsw;
7227}
7228
7229
7230IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7231{
7232 uint16_t const fFcw = pFpuState->FCW;
7233 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7234
7235 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7236 {
7237 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7238 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7239 else
7240 {
7241 /* Special case:
7242 2^+1.0 - 1.0 = 1.0
7243 2^-1.0 - 1.0 = -0.5 */
7244 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7245 && pr80Val->s.uMantissa == RT_BIT_64(63))
7246 {
7247 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7248 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7249 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7250 }
7251 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7252 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7253 else
7254 pFpuRes->r80Result = *pr80Val;
7255 fFsw |= X86_FSW_PE;
7256 if (!(fFcw & X86_FCW_PM))
7257 fFsw |= X86_FSW_ES | X86_FSW_B;
7258 }
7259 }
7260 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7261 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7262 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7263 pFpuRes->r80Result = *pr80Val;
7264 else if (RTFLOAT80U_IS_INF(pr80Val))
7265 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7266 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7267 {
7268 fFsw |= X86_FSW_DE;
7269 if (fFcw & X86_FCW_DM)
7270 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7271 else
7272 {
7273 pFpuRes->r80Result = *pr80Val;
7274 fFsw |= X86_FSW_ES | X86_FSW_B;
7275 }
7276 }
7277 else
7278 {
7279 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7280 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7281 && (fFcw & X86_FCW_IM))
7282 pFpuRes->r80Result = g_r80Indefinite;
7283 else
7284 {
7285 pFpuRes->r80Result = *pr80Val;
7286 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7287 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7288 }
7289 fFsw |= X86_FSW_IE;
7290 if (!(fFcw & X86_FCW_IM))
7291 fFsw |= X86_FSW_ES | X86_FSW_B;
7292 }
7293 pFpuRes->FSW = fFsw;
7294}
7295
7296#endif /* IEM_WITHOUT_ASSEMBLY */
7297
7298IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7299{
7300 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7301}
7302
7303IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7304{
7305 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7306}
7307
7308#ifdef IEM_WITHOUT_ASSEMBLY
7309
7310IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7311{
7312 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7313 pFpuRes->r80Result = *pr80Val;
7314 pFpuRes->r80Result.s.fSign = 0;
7315}
7316
7317
7318IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7319{
7320 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7321 pFpuRes->r80Result = *pr80Val;
7322 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7323}
7324
7325
7326IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7327{
7328 uint16_t const fFcw = pFpuState->FCW;
7329 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7330
7331 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7332 {
7333 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7334 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7335
7336 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7337 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7338 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7339 }
7340 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7341 {
7342 fFsw |= X86_FSW_ZE;
7343 if (fFcw & X86_FCW_ZM)
7344 {
7345 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7346 pFpuResTwo->r80Result2 = *pr80Val;
7347 }
7348 else
7349 {
7350 pFpuResTwo->r80Result2 = *pr80Val;
7351 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7352 }
7353 }
7354 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7355 {
7356 fFsw |= X86_FSW_DE;
7357 if (fFcw & X86_FCW_DM)
7358 {
7359 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7360 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7361 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7362 int32_t iExponent = -16382;
7363 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7364 {
7365 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7366 iExponent--;
7367 }
7368
7369 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7370 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7371 }
7372 else
7373 {
7374 pFpuResTwo->r80Result2 = *pr80Val;
7375 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7376 }
7377 }
7378 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7379 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7380 {
7381 pFpuResTwo->r80Result1 = *pr80Val;
7382 pFpuResTwo->r80Result2 = *pr80Val;
7383 }
7384 else if (RTFLOAT80U_IS_INF(pr80Val))
7385 {
7386 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7387 pFpuResTwo->r80Result2 = *pr80Val;
7388 }
7389 else
7390 {
7391 if (fFcw & X86_FCW_IM)
7392 {
7393 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7394 pFpuResTwo->r80Result1 = g_r80Indefinite;
7395 else
7396 {
7397 pFpuResTwo->r80Result1 = *pr80Val;
7398 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7399 }
7400 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7401 }
7402 else
7403 {
7404 pFpuResTwo->r80Result2 = *pr80Val;
7405 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7406 }
7407 fFsw |= X86_FSW_IE;
7408 }
7409 pFpuResTwo->FSW = fFsw;
7410}
7411#endif /* IEM_WITHOUT_ASSEMBLY */
7412
7413#if defined(IEM_WITHOUT_ASSEMBLY)
7414
7415static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7416{
7417 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7418 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7419 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7420 extFloat80_t v;
7421 (void)fFcw;
7422
7423 v = extF80_ylog2x(y, x, &SoftState);
7424 iemFpuSoftF80ToIprt(pr80Result, v);
7425
7426 return fFsw;
7427}
7428
7429IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7430 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7431{
7432 uint16_t const fFcw = pFpuState->FCW;
7433 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7434
7435 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7436 {
7437 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7438
7439 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7440 if (!(fFcw & X86_FCW_PM))
7441 fFsw |= X86_FSW_ES | X86_FSW_B;
7442 }
7443 else
7444 {
7445 fFsw |= X86_FSW_IE;
7446
7447 if (!(fFcw & X86_FCW_IM))
7448 {
7449 pFpuRes->r80Result = *pr80Val2;
7450 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7451 }
7452 else
7453 {
7454 pFpuRes->r80Result = g_r80Indefinite;
7455 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7456 }
7457 }
7458
7459 pFpuRes->FSW = fFsw;
7460}
7461#endif /* IEM_WITHOUT_ASSEMBLY */
7462
7463IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7464 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7465{
7466 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7467}
7468
7469IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7470 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7471{
7472 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7473}
7474
7475#if defined(IEM_WITHOUT_ASSEMBLY)
7476
7477static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7478{
7479 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7480 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7481 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7482 extFloat80_t v;
7483 (void)fFcw;
7484
7485 v = extF80_ylog2xp1(y, x, &SoftState);
7486 iemFpuSoftF80ToIprt(pr80Result, v);
7487
7488 return fFsw;
7489}
7490
7491IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7492 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7493{
7494 uint16_t const fFcw = pFpuState->FCW;
7495 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7496
7497 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7498 {
7499 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7500
7501 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7502 if (!(fFcw & X86_FCW_PM))
7503 fFsw |= X86_FSW_ES | X86_FSW_B;
7504 }
7505 else
7506 {
7507 fFsw |= X86_FSW_IE;
7508
7509 if (!(fFcw & X86_FCW_IM))
7510 {
7511 pFpuRes->r80Result = *pr80Val2;
7512 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7513 }
7514 else
7515 {
7516 pFpuRes->r80Result = g_r80Indefinite;
7517 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7518 }
7519 }
7520
7521 pFpuRes->FSW = fFsw;
7522}
7523
7524#endif /* IEM_WITHOUT_ASSEMBLY */
7525
7526IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7527 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7528{
7529 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7530}
7531
7532IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7533 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7534{
7535 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7536}
7537
7538
7539/*********************************************************************************************************************************
7540* MMX, SSE & AVX *
7541*********************************************************************************************************************************/
7542
7543/*
7544 * MOVSLDUP / VMOVSLDUP
7545 */
7546IEM_DECL_IMPL_DEF(void, iemAImpl_movsldup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7547{
7548 puDst->au32[0] = puSrc->au32[0];
7549 puDst->au32[1] = puSrc->au32[0];
7550 puDst->au32[2] = puSrc->au32[2];
7551 puDst->au32[3] = puSrc->au32[2];
7552}
7553
7554#ifdef IEM_WITH_VEX
7555
7556IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7557{
7558 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7559 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7560 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7561 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7562 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7563 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7564 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7565 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7566}
7567
7568
7569IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7570{
7571 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7572 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7573 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7574 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7575 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7576 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7577 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7578 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7579}
7580
7581#endif /* IEM_WITH_VEX */
7582
7583
7584/*
7585 * MOVSHDUP / VMOVSHDUP
7586 */
7587IEM_DECL_IMPL_DEF(void, iemAImpl_movshdup,(PRTUINT128U puDst, PCRTUINT128U puSrc))
7588{
7589 puDst->au32[0] = puSrc->au32[1];
7590 puDst->au32[1] = puSrc->au32[1];
7591 puDst->au32[2] = puSrc->au32[3];
7592 puDst->au32[3] = puSrc->au32[3];
7593}
7594
7595#ifdef IEM_WITH_VEX
7596
7597IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7598{
7599 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7600 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7601 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7602 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7603 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7604 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7605 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7606 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7607}
7608
7609
7610IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7611{
7612 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7613 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7614 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7615 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7616 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7617 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7618 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7619 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7620}
7621
7622#endif /* IEM_WITH_VEX */
7623
7624
7625/*
7626 * MOVDDUP / VMOVDDUP
7627 */
7628IEM_DECL_IMPL_DEF(void, iemAImpl_movddup,(PRTUINT128U puDst, uint64_t uSrc))
7629{
7630 puDst->au64[0] = uSrc;
7631 puDst->au64[1] = uSrc;
7632}
7633
7634#ifdef IEM_WITH_VEX
7635
7636IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7637{
7638 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7639 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7640 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7641 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7642}
7643
7644IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7645{
7646 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7647 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7648 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7649 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7650}
7651
7652#endif /* IEM_WITH_VEX */
7653
7654
7655/*
7656 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7657 */
7658#ifdef IEM_WITHOUT_ASSEMBLY
7659
7660IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7661{
7662 RT_NOREF(pFpuState);
7663 *puDst &= *puSrc;
7664}
7665
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7668{
7669 RT_NOREF(pFpuState);
7670 puDst->au64[0] &= puSrc->au64[0];
7671 puDst->au64[1] &= puSrc->au64[1];
7672}
7673
7674#endif
7675
7676IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7677 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7678{
7679 RT_NOREF(pExtState);
7680 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7681 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7682}
7683
7684
7685IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7686 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7687{
7688 RT_NOREF(pExtState);
7689 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7690 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7691 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7692 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7693}
7694
7695
7696/*
7697 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7698 */
7699#ifdef IEM_WITHOUT_ASSEMBLY
7700
7701IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7702{
7703 RT_NOREF(pFpuState);
7704 *puDst = ~*puDst & *puSrc;
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7709{
7710 RT_NOREF(pFpuState);
7711 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7712 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7713}
7714
7715#endif
7716
7717IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7718 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7719{
7720 RT_NOREF(pExtState);
7721 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7722 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7723}
7724
7725
7726IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7727 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7728{
7729 RT_NOREF(pExtState);
7730 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7731 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7732 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7733 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7734}
7735
7736
7737/*
7738 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7739 */
7740#ifdef IEM_WITHOUT_ASSEMBLY
7741
7742IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7743{
7744 RT_NOREF(pFpuState);
7745 *puDst |= *puSrc;
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7750{
7751 RT_NOREF(pFpuState);
7752 puDst->au64[0] |= puSrc->au64[0];
7753 puDst->au64[1] |= puSrc->au64[1];
7754}
7755
7756#endif
7757
7758IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7759 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7760{
7761 RT_NOREF(pExtState);
7762 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7763 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7764}
7765
7766
7767IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7768 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7769{
7770 RT_NOREF(pExtState);
7771 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7772 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7773 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7774 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7775}
7776
7777
7778/*
7779 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7780 */
7781#ifdef IEM_WITHOUT_ASSEMBLY
7782
7783IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7784{
7785 RT_NOREF(pFpuState);
7786 *puDst ^= *puSrc;
7787}
7788
7789
7790IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7791{
7792 RT_NOREF(pFpuState);
7793 puDst->au64[0] ^= puSrc->au64[0];
7794 puDst->au64[1] ^= puSrc->au64[1];
7795}
7796
7797#endif
7798
7799IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7800 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7801{
7802 RT_NOREF(pExtState);
7803 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7804 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7805}
7806
7807
7808IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7809 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7810{
7811 RT_NOREF(pExtState);
7812 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7813 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7814 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7815 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7816}
7817
7818
7819/*
7820 * PCMPEQB / VPCMPEQB
7821 */
7822#ifdef IEM_WITHOUT_ASSEMBLY
7823
7824IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7825{
7826 RT_NOREF(pFpuState);
7827 RTUINT64U uSrc1 = { *puDst };
7828 RTUINT64U uSrc2 = { *puSrc };
7829 RTUINT64U uDst;
7830 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7831 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7832 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7833 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7834 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7835 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7836 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7837 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7838 *puDst = uDst.u;
7839}
7840
7841
7842IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7843{
7844 RT_NOREF(pFpuState);
7845 RTUINT128U uSrc1 = *puDst;
7846 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7847 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7848 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7849 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7850 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7851 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7852 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7853 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7854 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7855 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7856 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7857 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7858 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7859 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7860 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7861 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7862}
7863
7864#endif
7865
7866IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7867 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7868{
7869 RT_NOREF(pExtState);
7870 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7871 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7872 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7873 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7874 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7875 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7876 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7877 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7878 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7879 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7880 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7881 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7882 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7883 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7884 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7885 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7886}
7887
7888IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7889 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7890{
7891 RT_NOREF(pExtState);
7892 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7893 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7894 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7895 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7896 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7897 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7898 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7899 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7900 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7901 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7902 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7903 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7904 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7905 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7906 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7907 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7908 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7909 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7910 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7911 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7912 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7913 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7914 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7915 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7916 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7917 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7918 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7919 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7920 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7921 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7922 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7923 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7924}
7925
7926
7927/*
7928 * PCMPEQW / VPCMPEQW
7929 */
7930#ifdef IEM_WITHOUT_ASSEMBLY
7931
7932IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7933{
7934 RT_NOREF(pFpuState);
7935 RTUINT64U uSrc1 = { *puDst };
7936 RTUINT64U uSrc2 = { *puSrc };
7937 RTUINT64U uDst;
7938 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7939 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7940 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7941 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7942 *puDst = uDst.u;
7943}
7944
7945
7946IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7947{
7948 RT_NOREF(pFpuState);
7949 RTUINT128U uSrc1 = *puDst;
7950 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7951 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7952 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7953 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7954 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7955 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7956 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7957 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7958}
7959
7960#endif
7961
7962IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7963 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7964{
7965 RT_NOREF(pExtState);
7966 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7967 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7968 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7969 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7970 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7971 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7972 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7973 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7974}
7975
7976IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7977 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7978{
7979 RT_NOREF(pExtState);
7980 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7981 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7982 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7983 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7984 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7985 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7986 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7987 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7988 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
7989 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
7990 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
7991 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
7992 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
7993 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
7994 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
7995 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
7996}
7997
7998
7999/*
8000 * PCMPEQD / VPCMPEQD.
8001 */
8002#ifdef IEM_WITHOUT_ASSEMBLY
8003
8004IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8005{
8006 RT_NOREF(pFpuState);
8007 RTUINT64U uSrc1 = { *puDst };
8008 RTUINT64U uSrc2 = { *puSrc };
8009 RTUINT64U uDst;
8010 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8011 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8012 *puDst = uDst.u;
8013}
8014
8015
8016IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8017{
8018 RT_NOREF(pFpuState);
8019 RTUINT128U uSrc1 = *puDst;
8020 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8021 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8022 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8023 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8024}
8025
8026#endif /* IEM_WITHOUT_ASSEMBLY */
8027
8028IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8029 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8030{
8031 RT_NOREF(pExtState);
8032 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8033 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8034 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8035 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8036}
8037
8038IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8039 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8040{
8041 RT_NOREF(pExtState);
8042 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8043 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8044 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8045 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8046 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8047 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8048 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8049 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8050}
8051
8052
8053/*
8054 * PCMPEQQ / VPCMPEQQ.
8055 */
8056IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8057{
8058 RT_NOREF(pFpuState);
8059 RTUINT128U uSrc1 = *puDst;
8060 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8061 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8062}
8063
8064IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8065 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8066{
8067 RT_NOREF(pExtState);
8068 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8069 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8070}
8071
8072IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8073 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8074{
8075 RT_NOREF(pExtState);
8076 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8077 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8078 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8079 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8080}
8081
8082
8083/*
8084 * PCMPGTB / VPCMPGTB
8085 */
8086#ifdef IEM_WITHOUT_ASSEMBLY
8087
8088IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8089{
8090 RT_NOREF(pFpuState);
8091 RTUINT64U uSrc1 = { *puDst };
8092 RTUINT64U uSrc2 = { *puSrc };
8093 RTUINT64U uDst;
8094 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8095 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8096 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8097 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8098 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8099 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8100 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8101 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8102 *puDst = uDst.u;
8103}
8104
8105
8106IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8107{
8108 RT_NOREF(pFpuState);
8109 RTUINT128U uSrc1 = *puDst;
8110 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8111 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8112 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8113 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8114 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8115 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8116 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8117 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8118 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8119 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8120 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8121 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8122 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8123 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8124 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8125 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8126}
8127
8128#endif
8129
8130IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8131 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8132{
8133 RT_NOREF(pExtState);
8134 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8135 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8136 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8137 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8138 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8139 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8140 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8141 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8142 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8143 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8144 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8145 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8146 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8147 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8148 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8149 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8150}
8151
8152IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8153 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8154{
8155 RT_NOREF(pExtState);
8156 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8157 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8158 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8159 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8160 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8161 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8162 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8163 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8164 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8165 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8166 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8167 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8168 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8169 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8170 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8171 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8172 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8173 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8174 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8175 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8176 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8177 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8178 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8179 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8180 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8181 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8182 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8183 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8184 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8185 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8186 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8187 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8188}
8189
8190
8191/*
8192 * PCMPGTW / VPCMPGTW
8193 */
8194#ifdef IEM_WITHOUT_ASSEMBLY
8195
8196IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8197{
8198 RT_NOREF(pFpuState);
8199 RTUINT64U uSrc1 = { *puDst };
8200 RTUINT64U uSrc2 = { *puSrc };
8201 RTUINT64U uDst;
8202 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8203 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8204 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8205 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8206 *puDst = uDst.u;
8207}
8208
8209
8210IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8211{
8212 RT_NOREF(pFpuState);
8213 RTUINT128U uSrc1 = *puDst;
8214 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8215 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8216 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8217 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8218 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8219 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8220 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8221 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8222}
8223
8224#endif
8225
8226IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8227 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8228{
8229 RT_NOREF(pExtState);
8230 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8231 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8232 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8233 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8234 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8235 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8236 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8237 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8238}
8239
8240IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8241 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8242{
8243 RT_NOREF(pExtState);
8244 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8245 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8246 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8247 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8248 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8249 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8250 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8251 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8252 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8253 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8254 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8255 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8256 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8257 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8258 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8259 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8260}
8261
8262
8263/*
8264 * PCMPGTD / VPCMPGTD.
8265 */
8266#ifdef IEM_WITHOUT_ASSEMBLY
8267
8268IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8269{
8270 RT_NOREF(pFpuState);
8271 RTUINT64U uSrc1 = { *puDst };
8272 RTUINT64U uSrc2 = { *puSrc };
8273 RTUINT64U uDst;
8274 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8275 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8276 *puDst = uDst.u;
8277}
8278
8279
8280IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8281{
8282 RT_NOREF(pFpuState);
8283 RTUINT128U uSrc1 = *puDst;
8284 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8285 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8286 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8287 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8288}
8289
8290#endif /* IEM_WITHOUT_ASSEMBLY */
8291
8292IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8293 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8294{
8295 RT_NOREF(pExtState);
8296 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8297 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8298 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8299 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8300}
8301
8302IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8303 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8304{
8305 RT_NOREF(pExtState);
8306 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8307 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8308 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8309 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8310 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8311 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8312 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8313 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8314}
8315
8316
8317/*
8318 * PCMPGTQ / VPCMPGTQ.
8319 */
8320IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8321{
8322 RT_NOREF(pFpuState);
8323 RTUINT128U uSrc1 = *puDst;
8324 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8325 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8326}
8327
8328IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8329 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8330{
8331 RT_NOREF(pExtState);
8332 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8333 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8334}
8335
8336IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8337 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8338{
8339 RT_NOREF(pExtState);
8340 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8341 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8342 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8343 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8344}
8345
8346
8347/*
8348 * PADDB / VPADDB
8349 */
8350#ifdef IEM_WITHOUT_ASSEMBLY
8351
8352IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8353{
8354 RT_NOREF(pFpuState);
8355 RTUINT64U uSrc1 = { *puDst };
8356 RTUINT64U uSrc2 = { *puSrc };
8357 RTUINT64U uDst;
8358 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8359 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8360 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8361 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8362 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8363 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8364 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8365 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8366 *puDst = uDst.u;
8367}
8368
8369
8370IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8371{
8372 RT_NOREF(pFpuState);
8373 RTUINT128U uSrc1 = *puDst;
8374 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8375 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8376 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8377 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8378 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8379 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8380 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8381 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8382 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8383 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8384 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8385 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8386 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8387 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8388 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8389 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8390}
8391
8392#endif
8393
8394
8395IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8396 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8397{
8398 RT_NOREF(pExtState);
8399 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8400 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8401 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8402 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8403 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8404 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8405 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8406 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8407 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8408 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8409 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8410 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8411 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8412 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8413 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8414 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8415}
8416
8417IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8418 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8419{
8420 RT_NOREF(pExtState);
8421 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8422 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8423 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8424 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8425 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8426 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8427 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8428 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8429 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8430 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8431 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8432 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8433 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8434 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8435 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8436 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8437 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8438 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8439 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8440 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8441 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8442 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8443 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8444 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8445 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8446 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8447 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8448 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8449 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8450 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8451 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8452 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8453}
8454
8455
8456/*
8457 * PADDSB / VPADDSB
8458 */
/* Narrows a signed 16-bit quantity to a signed byte with saturation.  The
   range test biases the value by 0x80 so that [-128, 127] maps onto
   [0, 0xff]; anything outside saturates to 0x7f (INT8_MAX), or to 0x80
   (INT8_MIN) when source bit 15 (the sign) is set.  The argument is
   evaluated more than once, so it must be free of side effects. */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff \
      ? (uint8_t)0x7f + (uint8_t)(((a_iWord) >> 15) & 1) \
      : (uint8_t)(a_iWord) )
8463
8464#ifdef IEM_WITHOUT_ASSEMBLY
8465
8466IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8467{
8468 RT_NOREF(pFpuState);
8469 RTUINT64U uSrc1 = { *puDst };
8470 RTUINT64U uSrc2 = { *puSrc };
8471 RTUINT64U uDst;
8472 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8473 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8474 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8475 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8476 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8477 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8478 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8479 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8480 *puDst = uDst.u;
8481}
8482
8483
8484IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8485{
8486 RT_NOREF(pFpuState);
8487 RTUINT128U uSrc1 = *puDst;
8488 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8489 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8490 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8491 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8492 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8493 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8494 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8495 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8496 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8497 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8498 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8499 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8500 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8501 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8502 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8503 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8504}
8505
8506#endif
8507
8508
8509/*
 * PADDUSB / VPADDUSB
8511 */
/* Narrows an unsigned 16-bit quantity to an unsigned byte, clamping values
   above 0xff (UINT8_MAX) to 0xff.  The argument is evaluated more than once,
   so it must be free of side effects. */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8516
8517#ifdef IEM_WITHOUT_ASSEMBLY
8518
8519IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8520{
8521 RT_NOREF(pFpuState);
8522 RTUINT64U uSrc1 = { *puDst };
8523 RTUINT64U uSrc2 = { *puSrc };
8524 RTUINT64U uDst;
8525 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8526 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8527 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8528 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8529 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8530 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8531 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8532 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8533 *puDst = uDst.u;
8534}
8535
8536
8537IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8538{
8539 RT_NOREF(pFpuState);
8540 RTUINT128U uSrc1 = *puDst;
8541 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8542 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8543 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8544 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8545 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8546 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8547 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8548 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8549 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8550 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8551 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8552 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8553 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8554 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8555 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8556 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8557}
8558
8559#endif
8560
8561
8562/*
8563 * PADDW / VPADDW
8564 */
8565#ifdef IEM_WITHOUT_ASSEMBLY
8566
8567IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8568{
8569 RT_NOREF(pFpuState);
8570 RTUINT64U uSrc1 = { *puDst };
8571 RTUINT64U uSrc2 = { *puSrc };
8572 RTUINT64U uDst;
8573 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8574 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8575 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8576 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8577 *puDst = uDst.u;
8578}
8579
8580
8581IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8582{
8583 RT_NOREF(pFpuState);
8584 RTUINT128U uSrc1 = *puDst;
8585 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8586 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8587 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8588 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8589 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8590 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8591 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8592 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8593}
8594
8595#endif
8596
8597
8598IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8599 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8600{
8601 RT_NOREF(pExtState);
8602 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8603 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8604 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8605 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8606 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8607 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8608 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8609 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8610}
8611
8612IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8613 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8614{
8615 RT_NOREF(pExtState);
8616 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8617 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8618 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8619 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8620 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8621 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8622 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8623 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8624 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8625 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8626 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8627 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8628 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8629 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8630 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8631 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8632}
8633
8634
8635/*
8636 * PADDSW / VPADDSW
8637 */
/**
 * Saturates a signed 32-bit value to the signed 16-bit range and yields the
 * result as a 16-bit bit pattern.
 *
 * Values from -0x8000 thru 0x7fff are truncated to their low 16 bits.  All
 * other values yield 0x7fff (INT16_MAX) when bit 31 of the input is clear and
 * 0x8000 (INT16_MIN) when it is set.
 *
 * Both the range check and the sign extraction operate on the uint32_t view
 * of the input: that way inputs close to INT32_MAX cannot overflow a signed
 * addition, and no negative value is ever right shifted.
 *
 * @note    The argument is evaluated more than once and must therefore be
 *          free of side effects.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) + (uint32_t)0x8000 <= (uint32_t)0xffff \
      ? (uint16_t)(a_iDword) \
      : (uint16_t)0x7fff + (uint16_t)((uint32_t)(a_iDword) >> 31) )
8642
8643#ifdef IEM_WITHOUT_ASSEMBLY
8644
8645IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8646{
8647 RT_NOREF(pFpuState);
8648 RTUINT64U uSrc1 = { *puDst };
8649 RTUINT64U uSrc2 = { *puSrc };
8650 RTUINT64U uDst;
8651 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8652 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8653 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8654 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8655 *puDst = uDst.u;
8656}
8657
8658
8659IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8660{
8661 RT_NOREF(pFpuState);
8662 RTUINT128U uSrc1 = *puDst;
8663 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8664 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8665 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8666 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8667 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8668 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8669 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8670 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8671}
8672
8673#endif
8674
8675
8676/*
8677 * PADDUSW / VPADDUSW
8678 */
/**
 * Saturates an unsigned sum to the unsigned 16-bit range: values above
 * 0xffff (UINT16_MAX) yield 0xffff, everything else is passed through.
 *
 * @note    The argument is evaluated more than once and must therefore be
 *          free of side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0xffff \
      : (uint16_t)(a_uDword) )
8683
8684#ifdef IEM_WITHOUT_ASSEMBLY
8685
8686IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8687{
8688 RT_NOREF(pFpuState);
8689 RTUINT64U uSrc1 = { *puDst };
8690 RTUINT64U uSrc2 = { *puSrc };
8691 RTUINT64U uDst;
8692 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8693 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8694 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8695 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8696 *puDst = uDst.u;
8697}
8698
8699
8700IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8701{
8702 RT_NOREF(pFpuState);
8703 RTUINT128U uSrc1 = *puDst;
8704 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8705 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8706 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8707 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8708 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8709 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8710 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8711 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8712}
8713
8714#endif
8715
8716
8717/*
8718 * PADDD / VPADDD.
8719 */
8720#ifdef IEM_WITHOUT_ASSEMBLY
8721
8722IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8723{
8724 RT_NOREF(pFpuState);
8725 RTUINT64U uSrc1 = { *puDst };
8726 RTUINT64U uSrc2 = { *puSrc };
8727 RTUINT64U uDst;
8728 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8729 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8730 *puDst = uDst.u;
8731}
8732
8733
8734IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8735{
8736 RT_NOREF(pFpuState);
8737 RTUINT128U uSrc1 = *puDst;
8738 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8739 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8740 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8741 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8742}
8743
8744#endif /* IEM_WITHOUT_ASSEMBLY */
8745
8746IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8747 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8748{
8749 RT_NOREF(pExtState);
8750 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8751 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8752 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8753 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8754}
8755
8756IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8757 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8758{
8759 RT_NOREF(pExtState);
8760 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8761 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8762 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8763 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8764 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8765 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8766 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8767 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8768}
8769
8770
8771/*
8772 * PADDQ / VPADDQ.
8773 */
8774#ifdef IEM_WITHOUT_ASSEMBLY
8775
8776IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8777{
8778 RT_NOREF(pFpuState);
8779 *puDst = *puDst + *puSrc;
8780}
8781
8782IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8783{
8784 RT_NOREF(pFpuState);
8785 RTUINT128U uSrc1 = *puDst;
8786 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8787 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8788}
8789
8790#endif
8791
8792IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8793 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8794{
8795 RT_NOREF(pExtState);
8796 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8797 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8798}
8799
8800IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8801 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8802{
8803 RT_NOREF(pExtState);
8804 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8805 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8806 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8807 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8808}
8809
8810
8811/*
8812 * PSUBB / VPSUBB
8813 */
8814#ifdef IEM_WITHOUT_ASSEMBLY
8815
8816IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8817{
8818 RT_NOREF(pFpuState);
8819 RTUINT64U uSrc1 = { *puDst };
8820 RTUINT64U uSrc2 = { *puSrc };
8821 RTUINT64U uDst;
8822 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8823 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8824 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8825 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8826 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8827 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8828 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8829 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8830 *puDst = uDst.u;
8831}
8832
8833
8834IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8835{
8836 RT_NOREF(pFpuState);
8837 RTUINT128U uSrc1 = *puDst;
8838 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8839 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8840 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8841 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8842 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8843 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8844 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8845 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8846 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8847 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8848 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8849 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8850 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8851 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8852 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8853 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8854}
8855
8856#endif
8857
8858IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8859 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8860{
8861 RT_NOREF(pExtState);
8862 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8863 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8864 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8865 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8866 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8867 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8868 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8869 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8870 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8871 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8872 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8873 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8874 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8875 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8876 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8877 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8878}
8879
8880IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8881 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8882{
8883 RT_NOREF(pExtState);
8884 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8885 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8886 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8887 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8888 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8889 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8890 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8891 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8892 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8893 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8894 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8895 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8896 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8897 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8898 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8899 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8900 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
8901 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
8902 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
8903 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
8904 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
8905 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
8906 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
8907 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
8908 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
8909 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
8910 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
8911 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
8912 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
8913 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
8914 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
8915 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
8916}
8917
8918
8919/*
 * PSUBSB / VPSUBSB
8921 */
8922#ifdef IEM_WITHOUT_ASSEMBLY
8923
8924IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8925{
8926 RT_NOREF(pFpuState);
8927 RTUINT64U uSrc1 = { *puDst };
8928 RTUINT64U uSrc2 = { *puSrc };
8929 RTUINT64U uDst;
8930 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
8931 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
8932 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
8933 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
8934 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
8935 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
8936 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
8937 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
8938 *puDst = uDst.u;
8939}
8940
8941
8942IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8943{
8944 RT_NOREF(pFpuState);
8945 RTUINT128U uSrc1 = *puDst;
8946 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
8947 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
8948 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
8949 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
8950 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
8951 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
8952 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
8953 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
8954 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
8955 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
8956 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
8957 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
8958 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
8959 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
8960 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
8961 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
8962}
8963
8964#endif
8965
8966
8967/*
 * PSUBUSB / VPSUBUSB
8969 */
/**
 * Saturates the difference of two unsigned bytes to the unsigned 8-bit
 * range.  The difference is viewed as uint16_t: anything above 0xff in that
 * view (which is where a negative difference ends up) yields 0, everything
 * else is passed through.
 *
 * @note    The argument is evaluated more than once and must therefore be
 *          free of side effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0 \
      : (uint8_t)(a_uWord) )
8974
8975#ifdef IEM_WITHOUT_ASSEMBLY
8976
8977IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8978{
8979 RT_NOREF(pFpuState);
8980 RTUINT64U uSrc1 = { *puDst };
8981 RTUINT64U uSrc2 = { *puSrc };
8982 RTUINT64U uDst;
8983 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
8984 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
8985 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
8986 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
8987 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
8988 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
8989 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
8990 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
8991 *puDst = uDst.u;
8992}
8993
8994
8995IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8996{
8997 RT_NOREF(pFpuState);
8998 RTUINT128U uSrc1 = *puDst;
8999 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9000 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9001 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9002 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9003 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9004 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9005 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9006 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9007 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9008 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9009 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9010 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9011 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9012 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9013 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9014 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9015}
9016
9017#endif
9018
9019
9020/*
9021 * PSUBW / VPSUBW
9022 */
9023#ifdef IEM_WITHOUT_ASSEMBLY
9024
9025IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9026{
9027 RT_NOREF(pFpuState);
9028 RTUINT64U uSrc1 = { *puDst };
9029 RTUINT64U uSrc2 = { *puSrc };
9030 RTUINT64U uDst;
9031 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9032 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9033 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9034 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9035 *puDst = uDst.u;
9036}
9037
9038
9039IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9040{
9041 RT_NOREF(pFpuState);
9042 RTUINT128U uSrc1 = *puDst;
9043 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9044 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9045 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9046 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9047 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9048 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9049 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9050 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9051}
9052
9053#endif
9054
9055IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9056 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9057{
9058 RT_NOREF(pExtState);
9059 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9060 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9061 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9062 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9063 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9064 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9065 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9066 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9067}
9068
9069IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9070 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9071{
9072 RT_NOREF(pExtState);
9073 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9074 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9075 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9076 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9077 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9078 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9079 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9080 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9081 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9082 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9083 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9084 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9085 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9086 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9087 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9088 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9089}
9090
9091
9092/*
9093 * PSUBSW / VPSUBSW
9094 */
9095#ifdef IEM_WITHOUT_ASSEMBLY
9096
9097IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9098{
9099 RT_NOREF(pFpuState);
9100 RTUINT64U uSrc1 = { *puDst };
9101 RTUINT64U uSrc2 = { *puSrc };
9102 RTUINT64U uDst;
9103 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9104 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9105 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9106 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9107 *puDst = uDst.u;
9108}
9109
9110
9111IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9112{
9113 RT_NOREF(pFpuState);
9114 RTUINT128U uSrc1 = *puDst;
9115 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9116 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9117 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9118 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9119 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9120 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9121 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9122 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9123}
9124
9125#endif
9126
9127
9128/*
9129 * PSUBUSW / VPSUBUSW
9130 */
/**
 * Saturates the difference of two unsigned words to the unsigned 16-bit
 * range.  The difference is viewed as uint32_t: anything above 0xffff in
 * that view (which is where a negative difference ends up) yields 0,
 * everything else is passed through.
 *
 * @note    The argument is evaluated more than once and must therefore be
 *          free of side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint16_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9135
9136#ifdef IEM_WITHOUT_ASSEMBLY
9137
9138IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9139{
9140 RT_NOREF(pFpuState);
9141 RTUINT64U uSrc1 = { *puDst };
9142 RTUINT64U uSrc2 = { *puSrc };
9143 RTUINT64U uDst;
9144 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9145 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9146 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9147 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9148 *puDst = uDst.u;
9149}
9150
9151
9152IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9153{
9154 RT_NOREF(pFpuState);
9155 RTUINT128U uSrc1 = *puDst;
9156 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9157 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9158 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9159 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9160 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9161 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9162 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9163 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9164}
9165
9166#endif
9167
9168
9169/*
9170 * PSUBD / VPSUBD.
9171 */
9172#ifdef IEM_WITHOUT_ASSEMBLY
9173
9174IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9175{
9176 RT_NOREF(pFpuState);
9177 RTUINT64U uSrc1 = { *puDst };
9178 RTUINT64U uSrc2 = { *puSrc };
9179 RTUINT64U uDst;
9180 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9181 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9182 *puDst = uDst.u;
9183}
9184
9185
9186IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9187{
9188 RT_NOREF(pFpuState);
9189 RTUINT128U uSrc1 = *puDst;
9190 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9191 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9192 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9193 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9194}
9195
9196#endif /* IEM_WITHOUT_ASSEMBLY */
9197
9198IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9199 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9200{
9201 RT_NOREF(pExtState);
9202 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9203 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9204 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9205 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9206}
9207
9208IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9209 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9210{
9211 RT_NOREF(pExtState);
9212 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9213 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9214 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9215 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9216 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9217 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9218 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9219 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9220}
9221
9222
9223/*
9224 * PSUBQ / VPSUBQ.
9225 */
9226#ifdef IEM_WITHOUT_ASSEMBLY
9227
9228IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9229{
9230 RT_NOREF(pFpuState);
9231 *puDst = *puDst - *puSrc;
9232}
9233
9234IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9235{
9236 RT_NOREF(pFpuState);
9237 RTUINT128U uSrc1 = *puDst;
9238 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9239 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9240}
9241
9242#endif
9243
9244IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9245 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9246{
9247 RT_NOREF(pExtState);
9248 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9249 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9250}
9251
9252IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9253 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9254{
9255 RT_NOREF(pExtState);
9256 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9257 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9258 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9259 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9260}
9261
9262
9263
9264/*
9265 * PMULLW / VPMULLW / PMULLD / VPMULLD
9266 */
9267#ifdef IEM_WITHOUT_ASSEMBLY
9268
9269IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9270{
9271 RT_NOREF(pFpuState);
9272 RTUINT64U uSrc1 = { *puDst };
9273 RTUINT64U uSrc2 = { *puSrc };
9274 RTUINT64U uDst;
9275 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9276 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9277 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9278 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9279 *puDst = uDst.u;
9280}
9281
9282
9283IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9284{
9285 RT_NOREF(pFpuState);
9286 RTUINT128U uSrc1 = *puDst;
9287 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9288 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9289 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9290 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9291 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9292 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9293 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9294 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9295}
9296
9297#endif
9298
9299IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9300{
9301 RTUINT128U uSrc1 = *puDst;
9302
9303 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9304 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9305 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9306 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9307 RT_NOREF(pFpuState);
9308}
9309
9310
9311IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9312{
9313 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9314 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9315 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9316 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9317 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9318 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9319 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9320 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9321}
9322
9323
9324IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9325{
9326 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9327 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9328 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9329 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9330 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9331 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9332 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9333 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9334 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9335 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9336 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9337 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9338 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9339 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9340 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9341 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9342}
9343
9344
9345IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9346{
9347 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9348 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9349 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9350 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9351}
9352
9353
9354IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9355{
9356 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9357 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9358 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9359 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9360 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9361 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9362 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9363 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9364}
9365
9366
9367/*
9368 * PMULHW / VPMULHW
9369 */
9370#ifdef IEM_WITHOUT_ASSEMBLY
9371
9372IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9373{
9374 RT_NOREF(pFpuState);
9375 RTUINT64U uSrc1 = { *puDst };
9376 RTUINT64U uSrc2 = { *puSrc };
9377 RTUINT64U uDst;
9378 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9379 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9380 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9381 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9382 *puDst = uDst.u;
9383}
9384
9385
9386IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9387{
9388 RT_NOREF(pFpuState);
9389 RTUINT128U uSrc1 = *puDst;
9390 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9391 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9392 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9393 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9394 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9395 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9396 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9397 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9398}
9399
9400#endif
9401
9402IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9403{
9404 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9405 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9406 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9407 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9408 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9409 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9410 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9411 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9412}
9413
9414
9415IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9416{
9417 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9418 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9419 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9420 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9421 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9422 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9423 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9424 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9425 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9426 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9427 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9428 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9429 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9430 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9431 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9432 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9433}
9434
9435
9436/*
9437 * PMULHUW / VPMULHUW
9438 */
9439#ifdef IEM_WITHOUT_ASSEMBLY
9440
9441IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9442{
9443 RTUINT64U uSrc1 = { *puDst };
9444 RTUINT64U uSrc2 = { *puSrc };
9445 RTUINT64U uDst;
9446 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9447 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9448 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9449 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9450 *puDst = uDst.u;
9451}
9452
9453
9454IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9455{
9456 RTUINT128U uSrc1 = *puDst;
9457 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9458 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9459 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9460 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9461 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9462 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9463 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9464 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9465}
9466
9467#endif
9468
9469IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9470{
9471 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9472 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9473 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9474 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9475 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9476 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9477 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9478 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9479}
9480
9481
9482IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9483{
9484 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9485 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9486 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9487 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9488 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9489 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9490 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9491 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9492 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9493 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9494 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9495 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9496 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9497 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9498 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9499 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9500}
9501
9502
9503/*
9504 * PSRLW / VPSRLW
9505 */
9506#ifdef IEM_WITHOUT_ASSEMBLY
9507
9508IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9509{
9510 RTUINT64U uSrc1 = { *puDst };
9511 RTUINT64U uSrc2 = { *puSrc };
9512 RTUINT64U uDst;
9513
9514 if (uSrc2.au64[0] <= 15)
9515 {
9516 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9517 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9518 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9519 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9520 }
9521 else
9522 {
9523 uDst.au64[0] = 0;
9524 }
9525 *puDst = uDst.u;
9526}
9527
9528
9529IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9530{
9531 RTUINT64U uSrc1 = { *puDst };
9532 RTUINT64U uDst;
9533
9534 if (uShift <= 15)
9535 {
9536 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9537 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9538 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9539 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9540 }
9541 else
9542 {
9543 uDst.au64[0] = 0;
9544 }
9545 *puDst = uDst.u;
9546}
9547
9548
9549IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9550{
9551 RTUINT128U uSrc1 = *puDst;
9552
9553 if (puSrc->au64[0] <= 15)
9554 {
9555 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9556 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9557 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9558 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9559 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9560 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9561 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9562 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9563 }
9564 else
9565 {
9566 puDst->au64[0] = 0;
9567 puDst->au64[1] = 0;
9568 }
9569}
9570
9571IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9572{
9573 RTUINT128U uSrc1 = *puDst;
9574
9575 if (uShift <= 15)
9576 {
9577 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9578 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9579 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9580 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9581 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9582 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9583 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9584 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9585 }
9586 else
9587 {
9588 puDst->au64[0] = 0;
9589 puDst->au64[1] = 0;
9590 }
9591}
9592
9593#endif
9594
9595
9596/*
9597 * PSRAW / VPSRAW
9598 */
9599#ifdef IEM_WITHOUT_ASSEMBLY
9600
9601IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9602{
9603 RTUINT64U uSrc1 = { *puDst };
9604 RTUINT64U uSrc2 = { *puSrc };
9605 RTUINT64U uDst;
9606
9607 if (uSrc2.au64[0] <= 15)
9608 {
9609 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9610 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9611 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9612 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9613 }
9614 else
9615 {
9616 uDst.au64[0] = 0;
9617 }
9618 *puDst = uDst.u;
9619}
9620
9621
9622IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9623{
9624 RTUINT64U uSrc1 = { *puDst };
9625 RTUINT64U uDst;
9626
9627 if (uShift <= 15)
9628 {
9629 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9630 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9631 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9632 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9633 }
9634 else
9635 {
9636 uDst.au64[0] = 0;
9637 }
9638 *puDst = uDst.u;
9639}
9640
9641
9642IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9643{
9644 RTUINT128U uSrc1 = *puDst;
9645
9646 if (puSrc->au64[0] <= 15)
9647 {
9648 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9649 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9650 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9651 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9652 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9653 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9654 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9655 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9656 }
9657 else
9658 {
9659 puDst->au64[0] = 0;
9660 puDst->au64[1] = 0;
9661 }
9662}
9663
9664IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9665{
9666 RTUINT128U uSrc1 = *puDst;
9667
9668 if (uShift <= 15)
9669 {
9670 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9671 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9672 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9673 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9674 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9675 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9676 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9677 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9678 }
9679 else
9680 {
9681 puDst->au64[0] = 0;
9682 puDst->au64[1] = 0;
9683 }
9684}
9685
9686#endif
9687
9688
9689/*
9690 * PSLLW / VPSLLW
9691 */
9692#ifdef IEM_WITHOUT_ASSEMBLY
9693
9694IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9695{
9696 RTUINT64U uSrc1 = { *puDst };
9697 RTUINT64U uSrc2 = { *puSrc };
9698 RTUINT64U uDst;
9699
9700 if (uSrc2.au64[0] <= 15)
9701 {
9702 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
9703 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
9704 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
9705 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
9706 }
9707 else
9708 {
9709 uDst.au64[0] = 0;
9710 }
9711 *puDst = uDst.u;
9712}
9713
9714
9715IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9716{
9717 RTUINT64U uSrc1 = { *puDst };
9718 RTUINT64U uDst;
9719
9720 if (uShift <= 15)
9721 {
9722 uDst.au16[0] = uSrc1.au16[0] << uShift;
9723 uDst.au16[1] = uSrc1.au16[1] << uShift;
9724 uDst.au16[2] = uSrc1.au16[2] << uShift;
9725 uDst.au16[3] = uSrc1.au16[3] << uShift;
9726 }
9727 else
9728 {
9729 uDst.au64[0] = 0;
9730 }
9731 *puDst = uDst.u;
9732}
9733
9734
9735IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9736{
9737 RTUINT128U uSrc1 = *puDst;
9738
9739 if (puSrc->au64[0] <= 15)
9740 {
9741 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
9742 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
9743 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
9744 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
9745 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
9746 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
9747 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
9748 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
9749 }
9750 else
9751 {
9752 puDst->au64[0] = 0;
9753 puDst->au64[1] = 0;
9754 }
9755}
9756
9757IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9758{
9759 RTUINT128U uSrc1 = *puDst;
9760
9761 if (uShift <= 15)
9762 {
9763 puDst->au16[0] = uSrc1.au16[0] << uShift;
9764 puDst->au16[1] = uSrc1.au16[1] << uShift;
9765 puDst->au16[2] = uSrc1.au16[2] << uShift;
9766 puDst->au16[3] = uSrc1.au16[3] << uShift;
9767 puDst->au16[4] = uSrc1.au16[4] << uShift;
9768 puDst->au16[5] = uSrc1.au16[5] << uShift;
9769 puDst->au16[6] = uSrc1.au16[6] << uShift;
9770 puDst->au16[7] = uSrc1.au16[7] << uShift;
9771 }
9772 else
9773 {
9774 puDst->au64[0] = 0;
9775 puDst->au64[1] = 0;
9776 }
9777}
9778
9779#endif
9780
9781
9782/*
9783 * PSRLD / VPSRLD
9784 */
9785#ifdef IEM_WITHOUT_ASSEMBLY
9786
9787IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9788{
9789 RTUINT64U uSrc1 = { *puDst };
9790 RTUINT64U uSrc2 = { *puSrc };
9791 RTUINT64U uDst;
9792
9793 if (uSrc2.au64[0] <= 31)
9794 {
9795 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
9796 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
9797 }
9798 else
9799 {
9800 uDst.au64[0] = 0;
9801 }
9802 *puDst = uDst.u;
9803}
9804
9805
9806IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9807{
9808 RTUINT64U uSrc1 = { *puDst };
9809 RTUINT64U uDst;
9810
9811 if (uShift <= 31)
9812 {
9813 uDst.au32[0] = uSrc1.au32[0] >> uShift;
9814 uDst.au32[1] = uSrc1.au32[1] >> uShift;
9815 }
9816 else
9817 {
9818 uDst.au64[0] = 0;
9819 }
9820 *puDst = uDst.u;
9821}
9822
9823
9824IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9825{
9826 RTUINT128U uSrc1 = *puDst;
9827
9828 if (puSrc->au64[0] <= 31)
9829 {
9830 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
9831 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
9832 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
9833 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
9834 }
9835 else
9836 {
9837 puDst->au64[0] = 0;
9838 puDst->au64[1] = 0;
9839 }
9840}
9841
9842IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9843{
9844 RTUINT128U uSrc1 = *puDst;
9845
9846 if (uShift <= 31)
9847 {
9848 puDst->au32[0] = uSrc1.au32[0] >> uShift;
9849 puDst->au32[1] = uSrc1.au32[1] >> uShift;
9850 puDst->au32[2] = uSrc1.au32[2] >> uShift;
9851 puDst->au32[3] = uSrc1.au32[3] >> uShift;
9852 }
9853 else
9854 {
9855 puDst->au64[0] = 0;
9856 puDst->au64[1] = 0;
9857 }
9858}
9859
9860#endif
9861
9862
9863/*
9864 * PSRAD / VPSRAD
9865 */
9866#ifdef IEM_WITHOUT_ASSEMBLY
9867
9868IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
9869{
9870 RTUINT64U uSrc1 = { *puDst };
9871 RTUINT64U uSrc2 = { *puSrc };
9872 RTUINT64U uDst;
9873
9874 if (uSrc2.au64[0] <= 31)
9875 {
9876 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
9877 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
9878 }
9879 else
9880 {
9881 uDst.au64[0] = 0;
9882 }
9883 *puDst = uDst.u;
9884}
9885
9886
9887IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
9888{
9889 RTUINT64U uSrc1 = { *puDst };
9890 RTUINT64U uDst;
9891
9892 if (uShift <= 31)
9893 {
9894 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
9895 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
9896 }
9897 else
9898 {
9899 uDst.au64[0] = 0;
9900 }
9901 *puDst = uDst.u;
9902}
9903
9904
9905IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9906{
9907 RTUINT128U uSrc1 = *puDst;
9908
9909 if (puSrc->au64[0] <= 31)
9910 {
9911 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
9912 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
9913 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
9914 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
9915 }
9916 else
9917 {
9918 puDst->au64[0] = 0;
9919 puDst->au64[1] = 0;
9920 }
9921}
9922
9923IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9924{
9925 RTUINT128U uSrc1 = *puDst;
9926
9927 if (uShift <= 31)
9928 {
9929 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
9930 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
9931 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
9932 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
9933 }
9934 else
9935 {
9936 puDst->au64[0] = 0;
9937 puDst->au64[1] = 0;
9938 }
9939}
9940
9941#endif
9942
9943
9944/*
9945 * PSLLD / VPSLLD
9946 */
9947#ifdef IEM_WITHOUT_ASSEMBLY
9948
9949IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
9950{
9951 RTUINT64U uSrc1 = { *puDst };
9952 RTUINT64U uSrc2 = { *puSrc };
9953 RTUINT64U uDst;
9954
9955 if (uSrc2.au64[0] <= 31)
9956 {
9957 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
9958 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
9959 }
9960 else
9961 {
9962 uDst.au64[0] = 0;
9963 }
9964 *puDst = uDst.u;
9965}
9966
9967
9968IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
9969{
9970 RTUINT64U uSrc1 = { *puDst };
9971 RTUINT64U uDst;
9972
9973 if (uShift <= 31)
9974 {
9975 uDst.au32[0] = uSrc1.au32[0] << uShift;
9976 uDst.au32[1] = uSrc1.au32[1] << uShift;
9977 }
9978 else
9979 {
9980 uDst.au64[0] = 0;
9981 }
9982 *puDst = uDst.u;
9983}
9984
9985
9986IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9987{
9988 RTUINT128U uSrc1 = *puDst;
9989
9990 if (puSrc->au64[0] <= 31)
9991 {
9992 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
9993 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
9994 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
9995 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
9996 }
9997 else
9998 {
9999 puDst->au64[0] = 0;
10000 puDst->au64[1] = 0;
10001 }
10002}
10003
10004IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10005{
10006 RTUINT128U uSrc1 = *puDst;
10007
10008 if (uShift <= 31)
10009 {
10010 puDst->au32[0] = uSrc1.au32[0] << uShift;
10011 puDst->au32[1] = uSrc1.au32[1] << uShift;
10012 puDst->au32[2] = uSrc1.au32[2] << uShift;
10013 puDst->au32[3] = uSrc1.au32[3] << uShift;
10014 }
10015 else
10016 {
10017 puDst->au64[0] = 0;
10018 puDst->au64[1] = 0;
10019 }
10020}
10021
10022#endif
10023
10024
10025/*
10026 * PSRLQ / VPSRLQ
10027 */
10028#ifdef IEM_WITHOUT_ASSEMBLY
10029
10030IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10031{
10032 RTUINT64U uSrc1 = { *puDst };
10033 RTUINT64U uSrc2 = { *puSrc };
10034 RTUINT64U uDst;
10035
10036 if (uSrc2.au64[0] <= 63)
10037 {
10038 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10039 }
10040 else
10041 {
10042 uDst.au64[0] = 0;
10043 }
10044 *puDst = uDst.u;
10045}
10046
10047
10048IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10049{
10050 RTUINT64U uSrc1 = { *puDst };
10051 RTUINT64U uDst;
10052
10053 if (uShift <= 63)
10054 {
10055 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10056 }
10057 else
10058 {
10059 uDst.au64[0] = 0;
10060 }
10061 *puDst = uDst.u;
10062}
10063
10064
10065IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10066{
10067 RTUINT128U uSrc1 = *puDst;
10068
10069 if (puSrc->au64[0] <= 63)
10070 {
10071 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10072 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10073 }
10074 else
10075 {
10076 puDst->au64[0] = 0;
10077 puDst->au64[1] = 0;
10078 }
10079}
10080
10081IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10082{
10083 RTUINT128U uSrc1 = *puDst;
10084
10085 if (uShift <= 63)
10086 {
10087 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10088 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10089 }
10090 else
10091 {
10092 puDst->au64[0] = 0;
10093 puDst->au64[1] = 0;
10094 }
10095}
10096
10097#endif
10098
10099
10100/*
10101 * PSLLQ / VPSLLQ
10102 */
10103#ifdef IEM_WITHOUT_ASSEMBLY
10104
10105IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10106{
10107 RTUINT64U uSrc1 = { *puDst };
10108 RTUINT64U uSrc2 = { *puSrc };
10109 RTUINT64U uDst;
10110
10111 if (uSrc2.au64[0] <= 63)
10112 {
10113 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10114 }
10115 else
10116 {
10117 uDst.au64[0] = 0;
10118 }
10119 *puDst = uDst.u;
10120}
10121
10122
10123IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10124{
10125 RTUINT64U uSrc1 = { *puDst };
10126 RTUINT64U uDst;
10127
10128 if (uShift <= 63)
10129 {
10130 uDst.au64[0] = uSrc1.au64[0] << uShift;
10131 }
10132 else
10133 {
10134 uDst.au64[0] = 0;
10135 }
10136 *puDst = uDst.u;
10137}
10138
10139
10140IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10141{
10142 RTUINT128U uSrc1 = *puDst;
10143
10144 if (puSrc->au64[0] <= 63)
10145 {
10146 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10147 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10148 }
10149 else
10150 {
10151 puDst->au64[0] = 0;
10152 puDst->au64[1] = 0;
10153 }
10154}
10155
10156IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10157{
10158 RTUINT128U uSrc1 = *puDst;
10159
10160 if (uShift <= 63)
10161 {
10162 puDst->au64[0] = uSrc1.au64[0] << uShift;
10163 puDst->au64[1] = uSrc1.au64[1] << uShift;
10164 }
10165 else
10166 {
10167 puDst->au64[0] = 0;
10168 puDst->au64[1] = 0;
10169 }
10170}
10171
10172#endif
10173
10174
10175/*
10176 * PSRLDQ / VPSRLDQ
10177 */
10178#ifdef IEM_WITHOUT_ASSEMBLY
10179
10180IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10181{
10182 RTUINT128U uSrc1 = *puDst;
10183
10184 if (uShift < 16)
10185 {
10186 int i;
10187
10188 for (i = 0; i < 16 - uShift; ++i)
10189 puDst->au8[i] = uSrc1.au8[i + uShift];
10190 for (i = 16 - uShift; i < 16; ++i)
10191 puDst->au8[i] = 0;
10192 }
10193 else
10194 {
10195 puDst->au64[0] = 0;
10196 puDst->au64[1] = 0;
10197 }
10198}
10199
10200#endif
10201
10202
10203/*
10204 * PSLLDQ / VPSLLDQ
10205 */
10206#ifdef IEM_WITHOUT_ASSEMBLY
10207
10208IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10209{
10210 RTUINT128U uSrc1 = *puDst;
10211
10212 if (uShift < 16)
10213 {
10214 int i;
10215
10216 for (i = 0; i < uShift; ++i)
10217 puDst->au8[i] = 0;
10218 for (i = uShift; i < 16; ++i)
10219 puDst->au8[i] = uSrc1.au8[i - uShift];
10220 }
10221 else
10222 {
10223 puDst->au64[0] = 0;
10224 puDst->au64[1] = 0;
10225 }
10226}
10227
10228#endif
10229
10230
10231/*
10232 * PMADDWD / VPMADDWD
10233 */
10234#ifdef IEM_WITHOUT_ASSEMBLY
10235
10236IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10237{
10238 RTUINT64U uSrc1 = { *puDst };
10239 RTUINT64U uSrc2 = { *puSrc };
10240 RTUINT64U uDst;
10241
10242 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10243 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10244 *puDst = uDst.u;
10245 RT_NOREF(pFpuState);
10246}
10247
10248
10249IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10250{
10251 RTUINT128U uSrc1 = *puDst;
10252
10253 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10254 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10255 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10256 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10257 RT_NOREF(pFpuState);
10258}
10259
10260#endif
10261
10262
10263/*
10264 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10265 */
10266#ifdef IEM_WITHOUT_ASSEMBLY
10267
10268IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10269{
10270 RTUINT64U uSrc1 = { *puDst };
10271 RTUINT64U uSrc2 = { *puSrc };
10272 RTUINT64U uDst;
10273
10274 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10275 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10276 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10277 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10278 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10279 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10280 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10281 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10282 *puDst = uDst.u;
10283 RT_NOREF(pFpuState);
10284}
10285
10286
10287IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10288{
10289 RTUINT128U uSrc1 = *puDst;
10290
10291 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10292 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10293 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10294 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10295 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10296 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10297 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10298 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10299 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10300 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10301 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10302 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10303 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10304 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10305 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10306 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10307 RT_NOREF(pFpuState);
10308}
10309
10310#endif
10311
10312
10313IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10314{
10315 RTUINT128U uSrc1 = *puDst;
10316
10317 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10318 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10319 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10320 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10321 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10322 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10323 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10324 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10325 RT_NOREF(pFpuState);
10326}
10327
10328
10329IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10330{
10331 RTUINT128U uSrc1 = *puDst;
10332
10333 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10334 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10335 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10336 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10337 RT_NOREF(pFpuState);
10338}
10339
10340
10341IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10342 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10343{
10344 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10345 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10346 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10347 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10348 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10349 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10350 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10351 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10352 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10353 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10354 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10355 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10356 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10357 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10358 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10359 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10360 RT_NOREF(pExtState);
10361}
10362
10363
10364IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10365 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10366{
10367 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10368 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10369 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10370 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10371 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10372 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10373 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10374 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10375 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10376 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10377 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10378 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10379 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10380 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10381 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10382 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10383 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10384 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10385 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10386 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10387 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10388 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10389 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10390 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10391 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10392 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10393 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10394 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10395 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10396 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10397 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10398 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10399 RT_NOREF(pExtState);
10400}
10401
10402
10403IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10404 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10405{
10406 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10407 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10408 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10409 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10410 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10411 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10412 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10413 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10414 RT_NOREF(pExtState);
10415}
10416
10417
10418IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10419 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10420{
10421 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10422 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10423 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10424 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10425 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10426 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10427 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10428 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10429 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10430 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10431 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10432 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10433 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10434 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10435 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10436 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10437 RT_NOREF(pExtState);
10438}
10439
10440
10441IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10442 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10443{
10444 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10445 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10446 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10447 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10448 RT_NOREF(pExtState);
10449}
10450
10451
10452IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10453 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10454{
10455 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10456 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10457 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10458 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10459 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10460 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10461 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10462 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10463 RT_NOREF(pExtState);
10464}
10465
10466
10467/*
10468 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10469 */
10470#ifdef IEM_WITHOUT_ASSEMBLY
10471
10472IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10473{
10474 RTUINT64U uSrc1 = { *puDst };
10475 RTUINT64U uSrc2 = { *puSrc };
10476 RTUINT64U uDst;
10477
10478 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10479 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10480 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10481 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10482 *puDst = uDst.u;
10483 RT_NOREF(pFpuState);
10484}
10485
10486
10487IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10488{
10489 RTUINT128U uSrc1 = *puDst;
10490
10491 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10492 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10493 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10494 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10495 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10496 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10497 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10498 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10499 RT_NOREF(pFpuState);
10500}
10501
10502#endif
10503
10504IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10505{
10506 RTUINT128U uSrc1 = *puDst;
10507
10508 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10509 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10510 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10511 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10512 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10513 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10514 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10515 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10516 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10517 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10518 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10519 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10520 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10521 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10522 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10523 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10524 RT_NOREF(pFpuState);
10525}
10526
10527
10528IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10529{
10530 RTUINT128U uSrc1 = *puDst;
10531
10532 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10533 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10534 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10535 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10536 RT_NOREF(pFpuState);
10537}
10538
10539
10540IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10541 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10542{
10543 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10544 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10545 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10546 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10547 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10548 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10549 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10550 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10551 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10552 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10553 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10554 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10555 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10556 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10557 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10558 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10559 RT_NOREF(pExtState);
10560}
10561
10562
10563IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10564 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10565{
10566 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10567 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10568 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10569 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10570 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10571 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10572 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10573 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10574 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10575 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10576 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10577 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10578 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10579 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10580 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10581 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10582 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10583 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10584 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10585 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10586 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10587 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10588 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10589 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10590 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10591 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10592 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10593 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10594 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10595 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10596 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10597 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10598 RT_NOREF(pExtState);
10599}
10600
10601
10602IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10603 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10604{
10605 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10606 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10607 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10608 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10609 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10610 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10611 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10612 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10613 RT_NOREF(pExtState);
10614}
10615
10616
10617IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10618 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10619{
10620 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10621 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10622 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10623 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10624 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10625 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10626 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10627 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10628 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10629 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10630 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10631 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10632 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10633 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10634 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10635 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10636 RT_NOREF(pExtState);
10637}
10638
10639
10640IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10641 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10642{
10643 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10644 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10645 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10646 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10647 RT_NOREF(pExtState);
10648}
10649
10650
10651IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10652 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10653{
10654 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10655 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10656 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10657 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10658 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10659 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10660 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10661 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10662 RT_NOREF(pExtState);
10663}
10664
10665
10666/*
10667 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10668 */
10669#ifdef IEM_WITHOUT_ASSEMBLY
10670
10671IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10672{
10673 RTUINT64U uSrc1 = { *puDst };
10674 RTUINT64U uSrc2 = { *puSrc };
10675 RTUINT64U uDst;
10676
10677 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10678 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10679 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10680 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10681 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10682 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10683 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10684 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10685 *puDst = uDst.u;
10686 RT_NOREF(pFpuState);
10687}
10688
10689
10690IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10691{
10692 RTUINT128U uSrc1 = *puDst;
10693
10694 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10695 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10696 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10697 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10698 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10699 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
10700 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
10701 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
10702 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
10703 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
10704 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
10705 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
10706 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
10707 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
10708 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
10709 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
10710 RT_NOREF(pFpuState);
10711}
10712
10713#endif
10714
10715IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10716{
10717 RTUINT128U uSrc1 = *puDst;
10718
10719 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
10720 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
10721 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
10722 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
10723 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
10724 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
10725 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
10726 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
10727 RT_NOREF(pFpuState);
10728}
10729
10730
10731IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10732{
10733 RTUINT128U uSrc1 = *puDst;
10734
10735 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
10736 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
10737 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
10738 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
10739 RT_NOREF(pFpuState);
10740}
10741
10742
10743IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10744 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10745{
10746 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10747 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10748 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10749 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10750 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10751 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10752 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10753 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10754 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10755 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10756 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10757 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10758 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10759 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10760 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10761 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10762 RT_NOREF(pExtState);
10763}
10764
10765
10766IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10767 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10768{
10769 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10770 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10771 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10772 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10773 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10774 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10775 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10776 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10777 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10778 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10779 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
10780 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
10781 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
10782 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
10783 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
10784 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
10785 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
10786 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
10787 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
10788 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
10789 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
10790 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
10791 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
10792 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
10793 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
10794 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
10795 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
10796 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
10797 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
10798 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
10799 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
10800 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
10801 RT_NOREF(pExtState);
10802}
10803
10804
10805IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10806 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10807{
10808 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10809 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10810 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10811 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10812 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10813 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10814 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10815 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10816 RT_NOREF(pExtState);
10817}
10818
10819
10820IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10821 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10822{
10823 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10824 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10825 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10826 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10827 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10828 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10829 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10830 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10831 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10832 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10833 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
10834 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
10835 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
10836 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
10837 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
10838 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
10839 RT_NOREF(pExtState);
10840}
10841
10842
10843IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10844 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10845{
10846 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10847 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10848 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10849 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10850 RT_NOREF(pExtState);
10851}
10852
10853
10854IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10855 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10856{
10857 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10858 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10859 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10860 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10861 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10862 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10863 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10864 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10865 RT_NOREF(pExtState);
10866}
10867
10868
10869/*
10870 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
10871 */
10872#ifdef IEM_WITHOUT_ASSEMBLY
10873
10874IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10875{
10876 RTUINT64U uSrc1 = { *puDst };
10877 RTUINT64U uSrc2 = { *puSrc };
10878 RTUINT64U uDst;
10879
10880 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
10881 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
10882 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
10883 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
10884 *puDst = uDst.u;
10885 RT_NOREF(pFpuState);
10886}
10887
10888
10889IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10890{
10891 RTUINT128U uSrc1 = *puDst;
10892
10893 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10894 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10895 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10896 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10897 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10898 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10899 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10900 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10901 RT_NOREF(pFpuState);
10902}
10903
10904#endif
10905
10906IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10907{
10908 RTUINT128U uSrc1 = *puDst;
10909
10910 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10911 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10912 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10913 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10914 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10915 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10916 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10917 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10918 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10919 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10920 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
10921 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
10922 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
10923 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
10924 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
10925 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
10926 RT_NOREF(pFpuState);
10927}
10928
10929
10930IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10931{
10932 RTUINT128U uSrc1 = *puDst;
10933
10934 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10935 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10936 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10937 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10938 RT_NOREF(pFpuState);
10939}
10940
10941
10942IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10943 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10944{
10945 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10946 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10947 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10948 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10949 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10950 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10951 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10952 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10953 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10954 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10955 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10956 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10957 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10958 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10959 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10960 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10961 RT_NOREF(pExtState);
10962}
10963
10964
10965IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10966 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10967{
10968 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10969 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10970 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10971 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10972 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10973 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10974 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10975 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10976 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10977 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10978 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
10979 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
10980 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
10981 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
10982 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
10983 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
10984 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
10985 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
10986 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
10987 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
10988 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
10989 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
10990 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
10991 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
10992 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
10993 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
10994 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
10995 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
10996 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
10997 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
10998 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
10999 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11000 RT_NOREF(pExtState);
11001}
11002
11003
11004IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11005 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11006{
11007 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11008 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11009 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11010 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11011 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11012 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11013 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11014 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11015 RT_NOREF(pExtState);
11016}
11017
11018
11019IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11020 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11021{
11022 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11023 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11024 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11025 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11026 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11027 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11028 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11029 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11030 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11031 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11032 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11033 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11034 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11035 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11036 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11037 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11038 RT_NOREF(pExtState);
11039}
11040
11041
11042IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11043 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11044{
11045 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11046 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11047 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11048 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11049 RT_NOREF(pExtState);
11050}
11051
11052
11053IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11054 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11055{
11056 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11057 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11058 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11059 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11060 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11061 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11062 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11063 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11064 RT_NOREF(pExtState);
11065}
11066
11067
11068/*
11069 * PAVGB / VPAVGB / PAVGW / VPAVGW
11070 */
/** Rounded-up average of two byte values; the sum is formed in a wider type
 *  so that adding the rounding bias (+1) cannot wrap before the shift. */
#define PAVGB_EXEC(a_Src1, a_Src2) \
    ((uint8_t)(((uint16_t)(a_Src1) + (a_Src2) + 1) >> 1))
/** Rounded-up average of two word values; same scheme as PAVGB_EXEC, one size up. */
#define PAVGW_EXEC(a_Src1, a_Src2) \
    ((uint16_t)(((uint32_t)(a_Src1) + (a_Src2) + 1) >> 1))
11073
11074#ifdef IEM_WITHOUT_ASSEMBLY
11075
11076IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11077{
11078 RTUINT64U uSrc1 = { *puDst };
11079 RTUINT64U uSrc2 = { *puSrc };
11080 RTUINT64U uDst;
11081
11082 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11083 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11084 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11085 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11086 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11087 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11088 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11089 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11090 *puDst = uDst.u;
11091}
11092
11093
11094IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11095{
11096 RTUINT128U uSrc1 = *puDst;
11097
11098 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11099 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11100 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11101 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11102 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11103 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11104 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11105 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11106 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11107 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11108 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11109 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11110 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11111 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11112 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11113 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11114}
11115
11116
11117IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11118{
11119 RTUINT64U uSrc1 = { *puDst };
11120 RTUINT64U uSrc2 = { *puSrc };
11121 RTUINT64U uDst;
11122
11123 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11124 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11125 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11126 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11127 *puDst = uDst.u;
11128}
11129
11130
11131IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11132{
11133 RTUINT128U uSrc1 = *puDst;
11134
11135 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11136 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11137 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11138 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11139 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11140 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11141 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11142 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11143}
11144
11145#endif
11146
11147IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11148{
11149 RTUINT128U uSrc1 = *puDst;
11150
11151 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11152 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11153 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11154 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11155 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11156 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11157 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11158 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11159 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11160 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11161 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11162 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11163 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11164 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11165 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11166 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11167}
11168
11169
11170IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11171{
11172 RTUINT128U uSrc1 = *puDst;
11173
11174 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11175 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11176 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11177 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11178 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11179 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11180 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11181 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11182 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11183 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11184 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11185 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11186 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11187 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11188 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11189 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11190}
11191
11192
11193IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11194{
11195 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11196 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11197 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11198 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11199 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11200 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11201 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11202 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11203 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11204 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11205 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11206 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11207 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11208 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11209 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11210 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11211}
11212
11213
11214IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11215{
11216 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11217 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11218 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11219 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11220 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11221 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11222 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11223 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11224 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11225 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11226 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11227 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11228 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11229 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11230 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11231 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11232 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11233 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11234 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11235 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11236 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11237 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11238 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11239 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11240 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11241 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11242 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11243 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11244 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11245 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11246 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11247 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11248}
11249
11250
11251IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11252{
11253 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11254 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11255 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11256 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11257 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11258 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11259 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11260 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11261}
11262
11263
11264IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11265{
11266 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11267 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11268 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11269 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11270 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11271 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11272 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11273 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11274 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11275 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11276 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11277 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11278 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11279 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11280 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11281 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11282}
11283
11284#undef PAVGB_EXEC
11285#undef PAVGW_EXEC
11286
11287
11288/*
11289 * PMOVMSKB / VPMOVMSKB
11290 */
11291#ifdef IEM_WITHOUT_ASSEMBLY
11292
11293IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11294{
11295 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11296 uint64_t const uSrc = *pu64Src;
11297 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11298 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11299 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11300 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11301 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11302 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11303 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11304 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11305}
11306
11307
11308IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11309{
11310 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11311 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11312 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11313 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11314 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11315 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11316 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11317 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11318 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11319 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11320 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11321 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11322 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11323 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11324 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11325 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11326 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11327 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11328 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11329}
11330
11331#endif
11332
11333IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11334{
11335 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11336 uint64_t const uSrc0 = puSrc->QWords.qw0;
11337 uint64_t const uSrc1 = puSrc->QWords.qw1;
11338 uint64_t const uSrc2 = puSrc->QWords.qw2;
11339 uint64_t const uSrc3 = puSrc->QWords.qw3;
11340 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11341 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11342 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11343 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11344 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11345 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11346 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11347 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11348 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11349 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11350 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11351 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11352 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11353 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11354 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11355 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11356 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11357 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11358 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11359 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11360 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11361 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11362 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11363 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11364 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11365 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11366 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11367 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11368 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11369 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11370 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11371 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11372}
11373
11374
11375/*
11376 * [V]PSHUFB
11377 */
11378
11379IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11380{
11381 RTUINT64U const uSrc = { *puSrc };
11382 RTUINT64U const uDstIn = { *puDst };
11383 ASMCompilerBarrier();
11384 RTUINT64U uDstOut = { 0 };
11385 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11386 {
11387 uint8_t idxSrc = uSrc.au8[iByte];
11388 if (!(idxSrc & 0x80))
11389 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11390 }
11391 *puDst = uDstOut.u;
11392 RT_NOREF(pFpuState);
11393}
11394
11395
11396IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11397{
11398 RTUINT128U const uSrc = *puSrc;
11399 RTUINT128U const uDstIn = *puDst;
11400 ASMCompilerBarrier();
11401 puDst->au64[0] = 0;
11402 puDst->au64[1] = 0;
11403 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11404 {
11405 uint8_t idxSrc = uSrc.au8[iByte];
11406 if (!(idxSrc & 0x80))
11407 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11408 }
11409 RT_NOREF(pFpuState);
11410}
11411
11412
11413IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11414 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11415{
11416 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11417 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11418 ASMCompilerBarrier();
11419 puDst->au64[0] = 0;
11420 puDst->au64[1] = 0;
11421 for (unsigned iByte = 0; iByte < 16; iByte++)
11422 {
11423 uint8_t idxSrc = uSrc2.au8[iByte];
11424 if (!(idxSrc & 0x80))
11425 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11426 }
11427 RT_NOREF(pExtState);
11428}
11429
11430
11431IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11432 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11433{
11434 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11435 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11436 ASMCompilerBarrier();
11437 puDst->au64[0] = 0;
11438 puDst->au64[1] = 0;
11439 puDst->au64[2] = 0;
11440 puDst->au64[3] = 0;
11441 for (unsigned iByte = 0; iByte < 16; iByte++)
11442 {
11443 uint8_t idxSrc = uSrc2.au8[iByte];
11444 if (!(idxSrc & 0x80))
11445 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11446 }
11447 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11448 {
11449 uint8_t idxSrc = uSrc2.au8[iByte];
11450 if (!(idxSrc & 0x80))
11451 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11452 }
11453 RT_NOREF(pExtState);
11454}
11455
11456
11457/*
11458 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11459 */
11460#ifdef IEM_WITHOUT_ASSEMBLY
11461
11462IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11463{
11464 uint64_t const uSrc = *puSrc;
11465 ASMCompilerBarrier();
11466 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11467 uSrc >> (((bEvil >> 2) & 3) * 16),
11468 uSrc >> (((bEvil >> 4) & 3) * 16),
11469 uSrc >> (((bEvil >> 6) & 3) * 16));
11470}
11471
11472
11473IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11474{
11475 puDst->QWords.qw0 = puSrc->QWords.qw0;
11476 uint64_t const uSrc = puSrc->QWords.qw1;
11477 ASMCompilerBarrier();
11478 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11479 uSrc >> (((bEvil >> 2) & 3) * 16),
11480 uSrc >> (((bEvil >> 4) & 3) * 16),
11481 uSrc >> (((bEvil >> 6) & 3) * 16));
11482}
11483
11484#endif
11485
11486IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11487{
11488 puDst->QWords.qw0 = puSrc->QWords.qw0;
11489 uint64_t const uSrc1 = puSrc->QWords.qw1;
11490 puDst->QWords.qw2 = puSrc->QWords.qw2;
11491 uint64_t const uSrc3 = puSrc->QWords.qw3;
11492 ASMCompilerBarrier();
11493 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11494 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11495 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11496 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11497 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11498 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11499 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11500 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11501}
11502
11503#ifdef IEM_WITHOUT_ASSEMBLY
11504IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11505{
11506 puDst->QWords.qw1 = puSrc->QWords.qw1;
11507 uint64_t const uSrc = puSrc->QWords.qw0;
11508 ASMCompilerBarrier();
11509 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11510 uSrc >> (((bEvil >> 2) & 3) * 16),
11511 uSrc >> (((bEvil >> 4) & 3) * 16),
11512 uSrc >> (((bEvil >> 6) & 3) * 16));
11513
11514}
11515#endif
11516
11517
11518IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11519{
11520 puDst->QWords.qw3 = puSrc->QWords.qw3;
11521 uint64_t const uSrc2 = puSrc->QWords.qw2;
11522 puDst->QWords.qw1 = puSrc->QWords.qw1;
11523 uint64_t const uSrc0 = puSrc->QWords.qw0;
11524 ASMCompilerBarrier();
11525 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11526 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11527 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11528 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11529 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11530 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11531 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11532 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11533
11534}
11535
11536
11537#ifdef IEM_WITHOUT_ASSEMBLY
11538IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11539{
11540 RTUINT128U const uSrc = *puSrc;
11541 ASMCompilerBarrier();
11542 puDst->au32[0] = uSrc.au32[bEvil & 3];
11543 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11544 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11545 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11546}
11547#endif
11548
11549
11550IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11551{
11552 RTUINT256U const uSrc = *puSrc;
11553 ASMCompilerBarrier();
11554 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11555 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11556 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11557 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11558 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11559 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11560 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11561 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11562}
11563
11564
11565/*
11566 * PUNPCKHBW - high bytes -> words
11567 */
11568#ifdef IEM_WITHOUT_ASSEMBLY
11569
11570IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11571{
11572 RTUINT64U const uSrc2 = { *puSrc };
11573 RTUINT64U const uSrc1 = { *puDst };
11574 ASMCompilerBarrier();
11575 RTUINT64U uDstOut;
11576 uDstOut.au8[0] = uSrc1.au8[4];
11577 uDstOut.au8[1] = uSrc2.au8[4];
11578 uDstOut.au8[2] = uSrc1.au8[5];
11579 uDstOut.au8[3] = uSrc2.au8[5];
11580 uDstOut.au8[4] = uSrc1.au8[6];
11581 uDstOut.au8[5] = uSrc2.au8[6];
11582 uDstOut.au8[6] = uSrc1.au8[7];
11583 uDstOut.au8[7] = uSrc2.au8[7];
11584 *puDst = uDstOut.u;
11585}
11586
11587
11588IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11589{
11590 RTUINT128U const uSrc2 = *puSrc;
11591 RTUINT128U const uSrc1 = *puDst;
11592 ASMCompilerBarrier();
11593 RTUINT128U uDstOut;
11594 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11595 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11596 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11597 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11598 uDstOut.au8[ 4] = uSrc1.au8[10];
11599 uDstOut.au8[ 5] = uSrc2.au8[10];
11600 uDstOut.au8[ 6] = uSrc1.au8[11];
11601 uDstOut.au8[ 7] = uSrc2.au8[11];
11602 uDstOut.au8[ 8] = uSrc1.au8[12];
11603 uDstOut.au8[ 9] = uSrc2.au8[12];
11604 uDstOut.au8[10] = uSrc1.au8[13];
11605 uDstOut.au8[11] = uSrc2.au8[13];
11606 uDstOut.au8[12] = uSrc1.au8[14];
11607 uDstOut.au8[13] = uSrc2.au8[14];
11608 uDstOut.au8[14] = uSrc1.au8[15];
11609 uDstOut.au8[15] = uSrc2.au8[15];
11610 *puDst = uDstOut;
11611}
11612
11613#endif
11614
11615IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11616{
11617 RTUINT128U const uSrc2 = *puSrc2;
11618 RTUINT128U const uSrc1 = *puSrc1;
11619 ASMCompilerBarrier();
11620 RTUINT128U uDstOut;
11621 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11622 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11623 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11624 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11625 uDstOut.au8[ 4] = uSrc1.au8[10];
11626 uDstOut.au8[ 5] = uSrc2.au8[10];
11627 uDstOut.au8[ 6] = uSrc1.au8[11];
11628 uDstOut.au8[ 7] = uSrc2.au8[11];
11629 uDstOut.au8[ 8] = uSrc1.au8[12];
11630 uDstOut.au8[ 9] = uSrc2.au8[12];
11631 uDstOut.au8[10] = uSrc1.au8[13];
11632 uDstOut.au8[11] = uSrc2.au8[13];
11633 uDstOut.au8[12] = uSrc1.au8[14];
11634 uDstOut.au8[13] = uSrc2.au8[14];
11635 uDstOut.au8[14] = uSrc1.au8[15];
11636 uDstOut.au8[15] = uSrc2.au8[15];
11637 *puDst = uDstOut;
11638}
11639
11640
11641IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11642{
11643 RTUINT256U const uSrc2 = *puSrc2;
11644 RTUINT256U const uSrc1 = *puSrc1;
11645 ASMCompilerBarrier();
11646 RTUINT256U uDstOut;
11647 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11648 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11649 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11650 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11651 uDstOut.au8[ 4] = uSrc1.au8[10];
11652 uDstOut.au8[ 5] = uSrc2.au8[10];
11653 uDstOut.au8[ 6] = uSrc1.au8[11];
11654 uDstOut.au8[ 7] = uSrc2.au8[11];
11655 uDstOut.au8[ 8] = uSrc1.au8[12];
11656 uDstOut.au8[ 9] = uSrc2.au8[12];
11657 uDstOut.au8[10] = uSrc1.au8[13];
11658 uDstOut.au8[11] = uSrc2.au8[13];
11659 uDstOut.au8[12] = uSrc1.au8[14];
11660 uDstOut.au8[13] = uSrc2.au8[14];
11661 uDstOut.au8[14] = uSrc1.au8[15];
11662 uDstOut.au8[15] = uSrc2.au8[15];
11663 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11664 uDstOut.au8[16] = uSrc1.au8[24];
11665 uDstOut.au8[17] = uSrc2.au8[24];
11666 uDstOut.au8[18] = uSrc1.au8[25];
11667 uDstOut.au8[19] = uSrc2.au8[25];
11668 uDstOut.au8[20] = uSrc1.au8[26];
11669 uDstOut.au8[21] = uSrc2.au8[26];
11670 uDstOut.au8[22] = uSrc1.au8[27];
11671 uDstOut.au8[23] = uSrc2.au8[27];
11672 uDstOut.au8[24] = uSrc1.au8[28];
11673 uDstOut.au8[25] = uSrc2.au8[28];
11674 uDstOut.au8[26] = uSrc1.au8[29];
11675 uDstOut.au8[27] = uSrc2.au8[29];
11676 uDstOut.au8[28] = uSrc1.au8[30];
11677 uDstOut.au8[29] = uSrc2.au8[30];
11678 uDstOut.au8[30] = uSrc1.au8[31];
11679 uDstOut.au8[31] = uSrc2.au8[31];
11680 *puDst = uDstOut;
11681}
11682
11683
11684/*
 * PUNPCKHWD - high words -> dwords
11686 */
11687#ifdef IEM_WITHOUT_ASSEMBLY
11688
11689IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11690{
11691 RTUINT64U const uSrc2 = { *puSrc };
11692 RTUINT64U const uSrc1 = { *puDst };
11693 ASMCompilerBarrier();
11694 RTUINT64U uDstOut;
11695 uDstOut.au16[0] = uSrc1.au16[2];
11696 uDstOut.au16[1] = uSrc2.au16[2];
11697 uDstOut.au16[2] = uSrc1.au16[3];
11698 uDstOut.au16[3] = uSrc2.au16[3];
11699 *puDst = uDstOut.u;
11700}
11701
11702
11703IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11704{
11705 RTUINT128U const uSrc2 = *puSrc;
11706 RTUINT128U const uSrc1 = *puDst;
11707 ASMCompilerBarrier();
11708 RTUINT128U uDstOut;
11709 uDstOut.au16[0] = uSrc1.au16[4];
11710 uDstOut.au16[1] = uSrc2.au16[4];
11711 uDstOut.au16[2] = uSrc1.au16[5];
11712 uDstOut.au16[3] = uSrc2.au16[5];
11713 uDstOut.au16[4] = uSrc1.au16[6];
11714 uDstOut.au16[5] = uSrc2.au16[6];
11715 uDstOut.au16[6] = uSrc1.au16[7];
11716 uDstOut.au16[7] = uSrc2.au16[7];
11717 *puDst = uDstOut;
11718}
11719
11720#endif
11721
11722IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11723{
11724 RTUINT128U const uSrc2 = *puSrc2;
11725 RTUINT128U const uSrc1 = *puSrc1;
11726 ASMCompilerBarrier();
11727 RTUINT128U uDstOut;
11728 uDstOut.au16[0] = uSrc1.au16[4];
11729 uDstOut.au16[1] = uSrc2.au16[4];
11730 uDstOut.au16[2] = uSrc1.au16[5];
11731 uDstOut.au16[3] = uSrc2.au16[5];
11732 uDstOut.au16[4] = uSrc1.au16[6];
11733 uDstOut.au16[5] = uSrc2.au16[6];
11734 uDstOut.au16[6] = uSrc1.au16[7];
11735 uDstOut.au16[7] = uSrc2.au16[7];
11736 *puDst = uDstOut;
11737}
11738
11739
11740IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11741{
11742 RTUINT256U const uSrc2 = *puSrc2;
11743 RTUINT256U const uSrc1 = *puSrc1;
11744 ASMCompilerBarrier();
11745 RTUINT256U uDstOut;
11746 uDstOut.au16[0] = uSrc1.au16[4];
11747 uDstOut.au16[1] = uSrc2.au16[4];
11748 uDstOut.au16[2] = uSrc1.au16[5];
11749 uDstOut.au16[3] = uSrc2.au16[5];
11750 uDstOut.au16[4] = uSrc1.au16[6];
11751 uDstOut.au16[5] = uSrc2.au16[6];
11752 uDstOut.au16[6] = uSrc1.au16[7];
11753 uDstOut.au16[7] = uSrc2.au16[7];
11754
11755 uDstOut.au16[8] = uSrc1.au16[12];
11756 uDstOut.au16[9] = uSrc2.au16[12];
11757 uDstOut.au16[10] = uSrc1.au16[13];
11758 uDstOut.au16[11] = uSrc2.au16[13];
11759 uDstOut.au16[12] = uSrc1.au16[14];
11760 uDstOut.au16[13] = uSrc2.au16[14];
11761 uDstOut.au16[14] = uSrc1.au16[15];
11762 uDstOut.au16[15] = uSrc2.au16[15];
11763 *puDst = uDstOut;
11764}
11765
11766
11767/*
 * PUNPCKHDQ - high dwords -> qword(s)
11769 */
11770#ifdef IEM_WITHOUT_ASSEMBLY
11771
11772IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
11773{
11774 RTUINT64U const uSrc2 = { *puSrc };
11775 RTUINT64U const uSrc1 = { *puDst };
11776 ASMCompilerBarrier();
11777 RTUINT64U uDstOut;
11778 uDstOut.au32[0] = uSrc1.au32[1];
11779 uDstOut.au32[1] = uSrc2.au32[1];
11780 *puDst = uDstOut.u;
11781}
11782
11783
11784IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11785{
11786 RTUINT128U const uSrc2 = *puSrc;
11787 RTUINT128U const uSrc1 = *puDst;
11788 ASMCompilerBarrier();
11789 RTUINT128U uDstOut;
11790 uDstOut.au32[0] = uSrc1.au32[2];
11791 uDstOut.au32[1] = uSrc2.au32[2];
11792 uDstOut.au32[2] = uSrc1.au32[3];
11793 uDstOut.au32[3] = uSrc2.au32[3];
11794 *puDst = uDstOut;
11795}
11796
11797#endif
11798
11799IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11800{
11801 RTUINT128U const uSrc2 = *puSrc2;
11802 RTUINT128U const uSrc1 = *puSrc1;
11803 ASMCompilerBarrier();
11804 RTUINT128U uDstOut;
11805 uDstOut.au32[0] = uSrc1.au32[2];
11806 uDstOut.au32[1] = uSrc2.au32[2];
11807 uDstOut.au32[2] = uSrc1.au32[3];
11808 uDstOut.au32[3] = uSrc2.au32[3];
11809 *puDst = uDstOut;
11810}
11811
11812
11813IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11814{
11815 RTUINT256U const uSrc2 = *puSrc2;
11816 RTUINT256U const uSrc1 = *puSrc1;
11817 ASMCompilerBarrier();
11818 RTUINT256U uDstOut;
11819 uDstOut.au32[0] = uSrc1.au32[2];
11820 uDstOut.au32[1] = uSrc2.au32[2];
11821 uDstOut.au32[2] = uSrc1.au32[3];
11822 uDstOut.au32[3] = uSrc2.au32[3];
11823
11824 uDstOut.au32[4] = uSrc1.au32[6];
11825 uDstOut.au32[5] = uSrc2.au32[6];
11826 uDstOut.au32[6] = uSrc1.au32[7];
11827 uDstOut.au32[7] = uSrc2.au32[7];
11828 *puDst = uDstOut;
11829}
11830
11831
11832/*
11833 * PUNPCKHQDQ -> High qwords -> double qword(s).
11834 */
11835#ifdef IEM_WITHOUT_ASSEMBLY
11836IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11837{
11838 RTUINT128U const uSrc2 = *puSrc;
11839 RTUINT128U const uSrc1 = *puDst;
11840 ASMCompilerBarrier();
11841 RTUINT128U uDstOut;
11842 uDstOut.au64[0] = uSrc1.au64[1];
11843 uDstOut.au64[1] = uSrc2.au64[1];
11844 *puDst = uDstOut;
11845}
11846#endif
11847
11848
11849IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11850{
11851 RTUINT128U const uSrc2 = *puSrc2;
11852 RTUINT128U const uSrc1 = *puSrc1;
11853 ASMCompilerBarrier();
11854 RTUINT128U uDstOut;
11855 uDstOut.au64[0] = uSrc1.au64[1];
11856 uDstOut.au64[1] = uSrc2.au64[1];
11857 *puDst = uDstOut;
11858}
11859
11860
11861IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11862{
11863 RTUINT256U const uSrc2 = *puSrc2;
11864 RTUINT256U const uSrc1 = *puSrc1;
11865 ASMCompilerBarrier();
11866 RTUINT256U uDstOut;
11867 uDstOut.au64[0] = uSrc1.au64[1];
11868 uDstOut.au64[1] = uSrc2.au64[1];
11869
11870 uDstOut.au64[2] = uSrc1.au64[3];
11871 uDstOut.au64[3] = uSrc2.au64[3];
11872 *puDst = uDstOut;
11873}
11874
11875
11876/*
11877 * PUNPCKLBW - low bytes -> words
11878 */
11879#ifdef IEM_WITHOUT_ASSEMBLY
11880
11881IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11882{
11883 RTUINT64U const uSrc2 = { *puSrc };
11884 RTUINT64U const uSrc1 = { *puDst };
11885 ASMCompilerBarrier();
11886 RTUINT64U uDstOut;
11887 uDstOut.au8[0] = uSrc1.au8[0];
11888 uDstOut.au8[1] = uSrc2.au8[0];
11889 uDstOut.au8[2] = uSrc1.au8[1];
11890 uDstOut.au8[3] = uSrc2.au8[1];
11891 uDstOut.au8[4] = uSrc1.au8[2];
11892 uDstOut.au8[5] = uSrc2.au8[2];
11893 uDstOut.au8[6] = uSrc1.au8[3];
11894 uDstOut.au8[7] = uSrc2.au8[3];
11895 *puDst = uDstOut.u;
11896}
11897
11898
11899IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11900{
11901 RTUINT128U const uSrc2 = *puSrc;
11902 RTUINT128U const uSrc1 = *puDst;
11903 ASMCompilerBarrier();
11904 RTUINT128U uDstOut;
11905 uDstOut.au8[ 0] = uSrc1.au8[0];
11906 uDstOut.au8[ 1] = uSrc2.au8[0];
11907 uDstOut.au8[ 2] = uSrc1.au8[1];
11908 uDstOut.au8[ 3] = uSrc2.au8[1];
11909 uDstOut.au8[ 4] = uSrc1.au8[2];
11910 uDstOut.au8[ 5] = uSrc2.au8[2];
11911 uDstOut.au8[ 6] = uSrc1.au8[3];
11912 uDstOut.au8[ 7] = uSrc2.au8[3];
11913 uDstOut.au8[ 8] = uSrc1.au8[4];
11914 uDstOut.au8[ 9] = uSrc2.au8[4];
11915 uDstOut.au8[10] = uSrc1.au8[5];
11916 uDstOut.au8[11] = uSrc2.au8[5];
11917 uDstOut.au8[12] = uSrc1.au8[6];
11918 uDstOut.au8[13] = uSrc2.au8[6];
11919 uDstOut.au8[14] = uSrc1.au8[7];
11920 uDstOut.au8[15] = uSrc2.au8[7];
11921 *puDst = uDstOut;
11922}
11923
11924#endif
11925
11926IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11927{
11928 RTUINT128U const uSrc2 = *puSrc2;
11929 RTUINT128U const uSrc1 = *puSrc1;
11930 ASMCompilerBarrier();
11931 RTUINT128U uDstOut;
11932 uDstOut.au8[ 0] = uSrc1.au8[0];
11933 uDstOut.au8[ 1] = uSrc2.au8[0];
11934 uDstOut.au8[ 2] = uSrc1.au8[1];
11935 uDstOut.au8[ 3] = uSrc2.au8[1];
11936 uDstOut.au8[ 4] = uSrc1.au8[2];
11937 uDstOut.au8[ 5] = uSrc2.au8[2];
11938 uDstOut.au8[ 6] = uSrc1.au8[3];
11939 uDstOut.au8[ 7] = uSrc2.au8[3];
11940 uDstOut.au8[ 8] = uSrc1.au8[4];
11941 uDstOut.au8[ 9] = uSrc2.au8[4];
11942 uDstOut.au8[10] = uSrc1.au8[5];
11943 uDstOut.au8[11] = uSrc2.au8[5];
11944 uDstOut.au8[12] = uSrc1.au8[6];
11945 uDstOut.au8[13] = uSrc2.au8[6];
11946 uDstOut.au8[14] = uSrc1.au8[7];
11947 uDstOut.au8[15] = uSrc2.au8[7];
11948 *puDst = uDstOut;
11949}
11950
11951
11952IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11953{
11954 RTUINT256U const uSrc2 = *puSrc2;
11955 RTUINT256U const uSrc1 = *puSrc1;
11956 ASMCompilerBarrier();
11957 RTUINT256U uDstOut;
11958 uDstOut.au8[ 0] = uSrc1.au8[0];
11959 uDstOut.au8[ 1] = uSrc2.au8[0];
11960 uDstOut.au8[ 2] = uSrc1.au8[1];
11961 uDstOut.au8[ 3] = uSrc2.au8[1];
11962 uDstOut.au8[ 4] = uSrc1.au8[2];
11963 uDstOut.au8[ 5] = uSrc2.au8[2];
11964 uDstOut.au8[ 6] = uSrc1.au8[3];
11965 uDstOut.au8[ 7] = uSrc2.au8[3];
11966 uDstOut.au8[ 8] = uSrc1.au8[4];
11967 uDstOut.au8[ 9] = uSrc2.au8[4];
11968 uDstOut.au8[10] = uSrc1.au8[5];
11969 uDstOut.au8[11] = uSrc2.au8[5];
11970 uDstOut.au8[12] = uSrc1.au8[6];
11971 uDstOut.au8[13] = uSrc2.au8[6];
11972 uDstOut.au8[14] = uSrc1.au8[7];
11973 uDstOut.au8[15] = uSrc2.au8[7];
11974 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11975 uDstOut.au8[16] = uSrc1.au8[16];
11976 uDstOut.au8[17] = uSrc2.au8[16];
11977 uDstOut.au8[18] = uSrc1.au8[17];
11978 uDstOut.au8[19] = uSrc2.au8[17];
11979 uDstOut.au8[20] = uSrc1.au8[18];
11980 uDstOut.au8[21] = uSrc2.au8[18];
11981 uDstOut.au8[22] = uSrc1.au8[19];
11982 uDstOut.au8[23] = uSrc2.au8[19];
11983 uDstOut.au8[24] = uSrc1.au8[20];
11984 uDstOut.au8[25] = uSrc2.au8[20];
11985 uDstOut.au8[26] = uSrc1.au8[21];
11986 uDstOut.au8[27] = uSrc2.au8[21];
11987 uDstOut.au8[28] = uSrc1.au8[22];
11988 uDstOut.au8[29] = uSrc2.au8[22];
11989 uDstOut.au8[30] = uSrc1.au8[23];
11990 uDstOut.au8[31] = uSrc2.au8[23];
11991 *puDst = uDstOut;
11992}
11993
11994
/*
 * PUNPCKLWD - low words -> dwords
 */
11998#ifdef IEM_WITHOUT_ASSEMBLY
11999
12000IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12001{
12002 RTUINT64U const uSrc2 = { *puSrc };
12003 RTUINT64U const uSrc1 = { *puDst };
12004 ASMCompilerBarrier();
12005 RTUINT64U uDstOut;
12006 uDstOut.au16[0] = uSrc1.au16[0];
12007 uDstOut.au16[1] = uSrc2.au16[0];
12008 uDstOut.au16[2] = uSrc1.au16[1];
12009 uDstOut.au16[3] = uSrc2.au16[1];
12010 *puDst = uDstOut.u;
12011}
12012
12013
12014IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12015{
12016 RTUINT128U const uSrc2 = *puSrc;
12017 RTUINT128U const uSrc1 = *puDst;
12018 ASMCompilerBarrier();
12019 RTUINT128U uDstOut;
12020 uDstOut.au16[0] = uSrc1.au16[0];
12021 uDstOut.au16[1] = uSrc2.au16[0];
12022 uDstOut.au16[2] = uSrc1.au16[1];
12023 uDstOut.au16[3] = uSrc2.au16[1];
12024 uDstOut.au16[4] = uSrc1.au16[2];
12025 uDstOut.au16[5] = uSrc2.au16[2];
12026 uDstOut.au16[6] = uSrc1.au16[3];
12027 uDstOut.au16[7] = uSrc2.au16[3];
12028 *puDst = uDstOut;
12029}
12030
12031#endif
12032
12033IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12034{
12035 RTUINT128U const uSrc2 = *puSrc2;
12036 RTUINT128U const uSrc1 = *puSrc1;
12037 ASMCompilerBarrier();
12038 RTUINT128U uDstOut;
12039 uDstOut.au16[0] = uSrc1.au16[0];
12040 uDstOut.au16[1] = uSrc2.au16[0];
12041 uDstOut.au16[2] = uSrc1.au16[1];
12042 uDstOut.au16[3] = uSrc2.au16[1];
12043 uDstOut.au16[4] = uSrc1.au16[2];
12044 uDstOut.au16[5] = uSrc2.au16[2];
12045 uDstOut.au16[6] = uSrc1.au16[3];
12046 uDstOut.au16[7] = uSrc2.au16[3];
12047 *puDst = uDstOut;
12048}
12049
12050
12051IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12052{
12053 RTUINT256U const uSrc2 = *puSrc2;
12054 RTUINT256U const uSrc1 = *puSrc1;
12055 ASMCompilerBarrier();
12056 RTUINT256U uDstOut;
12057 uDstOut.au16[0] = uSrc1.au16[0];
12058 uDstOut.au16[1] = uSrc2.au16[0];
12059 uDstOut.au16[2] = uSrc1.au16[1];
12060 uDstOut.au16[3] = uSrc2.au16[1];
12061 uDstOut.au16[4] = uSrc1.au16[2];
12062 uDstOut.au16[5] = uSrc2.au16[2];
12063 uDstOut.au16[6] = uSrc1.au16[3];
12064 uDstOut.au16[7] = uSrc2.au16[3];
12065
12066 uDstOut.au16[8] = uSrc1.au16[8];
12067 uDstOut.au16[9] = uSrc2.au16[8];
12068 uDstOut.au16[10] = uSrc1.au16[9];
12069 uDstOut.au16[11] = uSrc2.au16[9];
12070 uDstOut.au16[12] = uSrc1.au16[10];
12071 uDstOut.au16[13] = uSrc2.au16[10];
12072 uDstOut.au16[14] = uSrc1.au16[11];
12073 uDstOut.au16[15] = uSrc2.au16[11];
12074 *puDst = uDstOut;
12075}
12076
12077
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
12081#ifdef IEM_WITHOUT_ASSEMBLY
12082
12083IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12084{
12085 RTUINT64U const uSrc2 = { *puSrc };
12086 RTUINT64U const uSrc1 = { *puDst };
12087 ASMCompilerBarrier();
12088 RTUINT64U uDstOut;
12089 uDstOut.au32[0] = uSrc1.au32[0];
12090 uDstOut.au32[1] = uSrc2.au32[0];
12091 *puDst = uDstOut.u;
12092}
12093
12094
12095IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12096{
12097 RTUINT128U const uSrc2 = *puSrc;
12098 RTUINT128U const uSrc1 = *puDst;
12099 ASMCompilerBarrier();
12100 RTUINT128U uDstOut;
12101 uDstOut.au32[0] = uSrc1.au32[0];
12102 uDstOut.au32[1] = uSrc2.au32[0];
12103 uDstOut.au32[2] = uSrc1.au32[1];
12104 uDstOut.au32[3] = uSrc2.au32[1];
12105 *puDst = uDstOut;
12106}
12107
12108#endif
12109
12110IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12111{
12112 RTUINT128U const uSrc2 = *puSrc2;
12113 RTUINT128U const uSrc1 = *puSrc1;
12114 ASMCompilerBarrier();
12115 RTUINT128U uDstOut;
12116 uDstOut.au32[0] = uSrc1.au32[0];
12117 uDstOut.au32[1] = uSrc2.au32[0];
12118 uDstOut.au32[2] = uSrc1.au32[1];
12119 uDstOut.au32[3] = uSrc2.au32[1];
12120 *puDst = uDstOut;
12121}
12122
12123
12124IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12125{
12126 RTUINT256U const uSrc2 = *puSrc2;
12127 RTUINT256U const uSrc1 = *puSrc1;
12128 ASMCompilerBarrier();
12129 RTUINT256U uDstOut;
12130 uDstOut.au32[0] = uSrc1.au32[0];
12131 uDstOut.au32[1] = uSrc2.au32[0];
12132 uDstOut.au32[2] = uSrc1.au32[1];
12133 uDstOut.au32[3] = uSrc2.au32[1];
12134
12135 uDstOut.au32[4] = uSrc1.au32[4];
12136 uDstOut.au32[5] = uSrc2.au32[4];
12137 uDstOut.au32[6] = uSrc1.au32[5];
12138 uDstOut.au32[7] = uSrc2.au32[5];
12139 *puDst = uDstOut;
12140}
12141
12142
12143/*
12144 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12145 */
12146#ifdef IEM_WITHOUT_ASSEMBLY
12147IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12148{
12149 RTUINT128U const uSrc2 = *puSrc;
12150 RTUINT128U const uSrc1 = *puDst;
12151 ASMCompilerBarrier();
12152 RTUINT128U uDstOut;
12153 uDstOut.au64[0] = uSrc1.au64[0];
12154 uDstOut.au64[1] = uSrc2.au64[0];
12155 *puDst = uDstOut;
12156}
12157#endif
12158
12159
12160IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12161{
12162 RTUINT128U const uSrc2 = *puSrc2;
12163 RTUINT128U const uSrc1 = *puSrc1;
12164 ASMCompilerBarrier();
12165 RTUINT128U uDstOut;
12166 uDstOut.au64[0] = uSrc1.au64[0];
12167 uDstOut.au64[1] = uSrc2.au64[0];
12168 *puDst = uDstOut;
12169}
12170
12171
12172IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12173{
12174 RTUINT256U const uSrc2 = *puSrc2;
12175 RTUINT256U const uSrc1 = *puSrc1;
12176 ASMCompilerBarrier();
12177 RTUINT256U uDstOut;
12178 uDstOut.au64[0] = uSrc1.au64[0];
12179 uDstOut.au64[1] = uSrc2.au64[0];
12180
12181 uDstOut.au64[2] = uSrc1.au64[2];
12182 uDstOut.au64[3] = uSrc2.au64[2];
12183 *puDst = uDstOut;
12184}
12185
12186
12187/*
12188 * PACKSSWB - signed words -> signed bytes
12189 */
12190
12191#ifdef IEM_WITHOUT_ASSEMBLY
12192
12193IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12194{
12195 RTUINT64U const uSrc2 = { *puSrc };
12196 RTUINT64U const uSrc1 = { *puDst };
12197 ASMCompilerBarrier();
12198 RTUINT64U uDstOut;
12199 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12200 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12201 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12202 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12203 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12204 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12205 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12206 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12207 *puDst = uDstOut.u;
12208}
12209
12210
12211IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12212{
12213 RTUINT128U const uSrc2 = *puSrc;
12214 RTUINT128U const uSrc1 = *puDst;
12215 ASMCompilerBarrier();
12216 RTUINT128U uDstOut;
12217 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12218 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12219 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12220 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12221 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12222 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12223 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12224 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12225 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12226 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12227 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12228 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12229 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12230 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12231 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12232 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12233 *puDst = uDstOut;
12234}
12235
12236#endif
12237
12238IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12239{
12240 RTUINT128U const uSrc2 = *puSrc2;
12241 RTUINT128U const uSrc1 = *puSrc1;
12242 ASMCompilerBarrier();
12243 RTUINT128U uDstOut;
12244 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12245 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12246 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12247 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12248 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12249 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12250 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12251 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12252 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12253 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12254 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12255 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12256 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12257 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12258 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12259 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12260 *puDst = uDstOut;
12261}
12262
12263
12264IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12265{
12266 RTUINT256U const uSrc2 = *puSrc2;
12267 RTUINT256U const uSrc1 = *puSrc1;
12268 ASMCompilerBarrier();
12269 RTUINT256U uDstOut;
12270 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12271 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12272 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12273 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12274 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12275 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12276 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12277 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12278 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12279 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12280 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12281 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12282 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12283 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12284 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12285 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12286
12287 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12288 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12289 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12290 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12291 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12292 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12293 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12294 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12295 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12296 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12297 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12298 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12299 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12300 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12301 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12302 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12303 *puDst = uDstOut;
12304}
12305
12306
12307/*
12308 * PACKUSWB - signed words -> unsigned bytes
12309 */
/*
 * Clamps a 16-bit value, read as signed, into the unsigned byte range:
 *   - bit patterns 0x0000..0x00ff are passed through,
 *   - larger patterns with bit 15 (the sign) clear yield 0xff (UINT8_MAX),
 *   - patterns with bit 15 set yield 0x00 (UINT8_MIN).
 * The argument is evaluated more than once, so it must be free of side effects.
 */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    ( (uint16_t)(a_iWord) > (uint16_t)0xff \
      ? ( ((a_iWord) & 0x8000) != 0 ? (uint8_t)0x00 : (uint8_t)0xff ) \
      : (uint8_t)(a_iWord) )
12314
12315#ifdef IEM_WITHOUT_ASSEMBLY
12316
12317IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12318{
12319 RTUINT64U const uSrc2 = { *puSrc };
12320 RTUINT64U const uSrc1 = { *puDst };
12321 ASMCompilerBarrier();
12322 RTUINT64U uDstOut;
12323 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12324 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12325 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12326 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12327 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12328 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12329 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12330 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12331 *puDst = uDstOut.u;
12332}
12333
12334
12335IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12336{
12337 RTUINT128U const uSrc2 = *puSrc;
12338 RTUINT128U const uSrc1 = *puDst;
12339 ASMCompilerBarrier();
12340 RTUINT128U uDstOut;
12341 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12342 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12343 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12344 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12345 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12346 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12347 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12348 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12349 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12350 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12351 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12352 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12353 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12354 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12355 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12356 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12357 *puDst = uDstOut;
12358}
12359
12360#endif
12361
12362IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12363{
12364 RTUINT128U const uSrc2 = *puSrc2;
12365 RTUINT128U const uSrc1 = *puSrc1;
12366 ASMCompilerBarrier();
12367 RTUINT128U uDstOut;
12368 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12369 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12370 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12371 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12372 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12373 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12374 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12375 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12376 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12377 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12378 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12379 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12380 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12381 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12382 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12383 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12384 *puDst = uDstOut;
12385}
12386
12387
12388IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12389{
12390 RTUINT256U const uSrc2 = *puSrc2;
12391 RTUINT256U const uSrc1 = *puSrc1;
12392 ASMCompilerBarrier();
12393 RTUINT256U uDstOut;
12394 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12395 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12396 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12397 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12398 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12399 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12400 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12401 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12402 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12403 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12404 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12405 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12406 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12407 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12408 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12409 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12410
12411 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12412 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12413 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12414 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12415 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12416 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12417 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12418 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12419 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12420 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12421 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12422 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12423 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12424 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12425 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12426 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12427 *puDst = uDstOut;
12428}
12429
12430
12431/*
12432 * PACKSSDW - signed dwords -> signed words
12433 */
12434
12435#ifdef IEM_WITHOUT_ASSEMBLY
12436
12437IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12438{
12439 RTUINT64U const uSrc2 = { *puSrc };
12440 RTUINT64U const uSrc1 = { *puDst };
12441 ASMCompilerBarrier();
12442 RTUINT64U uDstOut;
12443 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12444 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12445 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12446 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12447 *puDst = uDstOut.u;
12448}
12449
12450
12451IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12452{
12453 RTUINT128U const uSrc2 = *puSrc;
12454 RTUINT128U const uSrc1 = *puDst;
12455 ASMCompilerBarrier();
12456 RTUINT128U uDstOut;
12457 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12458 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12459 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12460 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12461 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12462 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12463 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12464 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12465 *puDst = uDstOut;
12466}
12467
12468#endif
12469
12470IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12471{
12472 RTUINT128U const uSrc2 = *puSrc2;
12473 RTUINT128U const uSrc1 = *puSrc1;
12474 ASMCompilerBarrier();
12475 RTUINT128U uDstOut;
12476 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12477 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12478 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12479 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12480 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12481 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12482 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12483 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12484 *puDst = uDstOut;
12485}
12486
12487
12488IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12489{
12490 RTUINT256U const uSrc2 = *puSrc2;
12491 RTUINT256U const uSrc1 = *puSrc1;
12492 ASMCompilerBarrier();
12493 RTUINT256U uDstOut;
12494 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12495 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12496 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12497 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12498 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12499 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12500 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12501 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12502
12503 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12504 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12505 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12506 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12507 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12508 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12509 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12510 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12511 *puDst = uDstOut;
12512}
12513
12514
12515/*
12516 * PACKUSDW - signed dwords -> unsigned words
12517 */
/*
 * Clamps a 32-bit value, read as signed, into the unsigned word range:
 *   - bit patterns 0x00000000..0x0000ffff are passed through,
 *   - larger patterns with bit 31 (the sign) clear yield 0xffff (UINT16_MAX),
 *   - patterns with bit 31 set yield 0x0000.
 * The argument is evaluated more than once, so it must be free of side effects.
 */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    ( (uint32_t)(a_iDword) > (uint16_t)0xffff \
      ? ( ((a_iDword) & 0x80000000U) != 0 ? (uint16_t)0x0000 : (uint16_t)0xffff ) \
      : (uint16_t)(a_iDword) )
12522
12523#ifdef IEM_WITHOUT_ASSEMBLY
12524IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12525{
12526 RTUINT128U const uSrc2 = *puSrc;
12527 RTUINT128U const uSrc1 = *puDst;
12528 ASMCompilerBarrier();
12529 RTUINT128U uDstOut;
12530 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12531 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12532 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12533 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12534 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12535 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12536 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12537 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12538 *puDst = uDstOut;
12539}
12540#endif
12541
12542IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12543{
12544 RTUINT128U const uSrc2 = *puSrc2;
12545 RTUINT128U const uSrc1 = *puSrc1;
12546 ASMCompilerBarrier();
12547 RTUINT128U uDstOut;
12548 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12549 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12550 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12551 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12552 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12553 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12554 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12555 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12556 *puDst = uDstOut;
12557}
12558
12559
12560IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12561{
12562 RTUINT256U const uSrc2 = *puSrc2;
12563 RTUINT256U const uSrc1 = *puSrc1;
12564 ASMCompilerBarrier();
12565 RTUINT256U uDstOut;
12566 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12567 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12568 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12569 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12570 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12571 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12572 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12573 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12574
12575 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12576 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12577 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12578 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12579 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12580 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12581 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12582 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12583 *puDst = uDstOut;
12584}
12585
12586
12587/*
12588 * [V]PABSB / [V]PABSW / [V]PABSD
12589 */
12590
12591IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12592{
12593 RTUINT64U const uSrc = { *puSrc };
12594 RTUINT64U uDstOut = { 0 };
12595
12596 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12597 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12598 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12599 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12600 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12601 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12602 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12603 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12604 *puDst = uDstOut.u;
12605 RT_NOREF(pFpuState);
12606}
12607
12608
12609IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12610{
12611 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12612 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12613 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12614 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12615 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12616 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12617 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12618 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12619 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12620 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12621 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12622 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12623 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12624 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12625 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12626 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12627 RT_NOREF(pFpuState);
12628}
12629
12630
12631IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12632{
12633 RTUINT64U const uSrc = { *puSrc };
12634 RTUINT64U uDstOut = { 0 };
12635
12636 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12637 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12638 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12639 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12640 *puDst = uDstOut.u;
12641 RT_NOREF(pFpuState);
12642}
12643
12644
12645IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12646{
12647 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12648 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12649 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12650 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12651 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12652 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12653 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12654 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12655 RT_NOREF(pFpuState);
12656}
12657
12658
12659IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12660{
12661 RTUINT64U const uSrc = { *puSrc };
12662 RTUINT64U uDstOut = { 0 };
12663
12664 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12665 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12666 *puDst = uDstOut.u;
12667 RT_NOREF(pFpuState);
12668}
12669
12670
12671IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12672{
12673 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12674 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12675 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12676 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12677 RT_NOREF(pFpuState);
12678}
12679
12680
12681IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12682{
12683 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12684 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12685 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12686 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12687 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12688 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12689 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12690 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12691 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12692 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12693 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12694 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12695 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12696 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12697 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12698 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12699}
12700
12701
12702IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12703{
12704 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12705 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12706 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12707 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12708 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12709 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12710 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12711 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12712 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12713 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12714 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12715 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12716 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12717 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12718 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12719 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12720 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
12721 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
12722 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
12723 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
12724 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
12725 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
12726 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
12727 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
12728 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
12729 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
12730 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
12731 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
12732 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
12733 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
12734 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
12735 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
12736}
12737
12738
12739IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12740{
12741 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12742 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12743 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12744 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12745 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12746 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12747 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12748 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12749}
12750
12751
12752IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12753{
12754 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12755 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12756 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12757 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12758 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12759 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12760 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12761 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12762 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
12763 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
12764 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
12765 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
12766 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
12767 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
12768 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
12769 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
12770}
12771
12772
12773IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12774{
12775 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12776 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12777 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12778 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12779}
12780
12781
12782IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
12783{
12784 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12785 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12786 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12787 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12788 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
12789 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
12790 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
12791 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
12792}
12793
12794
12795/*
12796 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
12797 */
12798IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12799{
12800 RTUINT64U uSrc1 = { *puDst };
12801 RTUINT64U uSrc2 = { *puSrc };
12802 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12803
12804 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
12805 {
12806 if (uSrc2.ai8[i] < 0)
12807 uDst.ai8[i] = -uSrc1.ai8[i];
12808 else if (uSrc2.ai8[i] == 0)
12809 uDst.ai8[i] = 0;
12810 else /* uSrc2.ai8[i] > 0 */
12811 uDst.ai8[i] = uSrc1.ai8[i];
12812 }
12813
12814 *puDst = uDst.u;
12815 RT_NOREF(pFpuState);
12816}
12817
12818
12819IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12820{
12821 RTUINT128U uSrc1 = *puDst;
12822
12823 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12824 {
12825 if (puSrc->ai8[i] < 0)
12826 puDst->ai8[i] = -uSrc1.ai8[i];
12827 else if (puSrc->ai8[i] == 0)
12828 puDst->ai8[i] = 0;
12829 else /* puSrc->ai8[i] > 0 */
12830 puDst->ai8[i] = uSrc1.ai8[i];
12831 }
12832
12833 RT_NOREF(pFpuState);
12834}
12835
12836
12837IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12838{
12839 RTUINT64U uSrc1 = { *puDst };
12840 RTUINT64U uSrc2 = { *puSrc };
12841 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12842
12843 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
12844 {
12845 if (uSrc2.ai16[i] < 0)
12846 uDst.ai16[i] = -uSrc1.ai16[i];
12847 else if (uSrc2.ai16[i] == 0)
12848 uDst.ai16[i] = 0;
12849 else /* uSrc2.ai16[i] > 0 */
12850 uDst.ai16[i] = uSrc1.ai16[i];
12851 }
12852
12853 *puDst = uDst.u;
12854 RT_NOREF(pFpuState);
12855}
12856
12857
12858IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12859{
12860 RTUINT128U uSrc1 = *puDst;
12861
12862 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12863 {
12864 if (puSrc->ai16[i] < 0)
12865 puDst->ai16[i] = -uSrc1.ai16[i];
12866 else if (puSrc->ai16[i] == 0)
12867 puDst->ai16[i] = 0;
12868 else /* puSrc->ai16[i] > 0 */
12869 puDst->ai16[i] = uSrc1.ai16[i];
12870 }
12871
12872 RT_NOREF(pFpuState);
12873}
12874
12875
12876IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12877{
12878 RTUINT64U uSrc1 = { *puDst };
12879 RTUINT64U uSrc2 = { *puSrc };
12880 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
12881
12882 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
12883 {
12884 if (uSrc2.ai32[i] < 0)
12885 uDst.ai32[i] = -uSrc1.ai32[i];
12886 else if (uSrc2.ai32[i] == 0)
12887 uDst.ai32[i] = 0;
12888 else /* uSrc2.ai32[i] > 0 */
12889 uDst.ai32[i] = uSrc1.ai32[i];
12890 }
12891
12892 *puDst = uDst.u;
12893 RT_NOREF(pFpuState);
12894}
12895
12896
12897IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12898{
12899 RTUINT128U uSrc1 = *puDst;
12900
12901 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12902 {
12903 if (puSrc->ai32[i] < 0)
12904 puDst->ai32[i] = -uSrc1.ai32[i];
12905 else if (puSrc->ai32[i] == 0)
12906 puDst->ai32[i] = 0;
12907 else /* puSrc->ai32[i] > 0 */
12908 puDst->ai32[i] = uSrc1.ai32[i];
12909 }
12910
12911 RT_NOREF(pFpuState);
12912}
12913
12914
12915IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12916{
12917 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12918 {
12919 if (puSrc2->ai8[i] < 0)
12920 puDst->ai8[i] = -puSrc1->ai8[i];
12921 else if (puSrc2->ai8[i] == 0)
12922 puDst->ai8[i] = 0;
12923 else /* puSrc2->ai8[i] > 0 */
12924 puDst->ai8[i] = puSrc1->ai8[i];
12925 }
12926}
12927
12928
12929IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12930{
12931 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
12932 {
12933 if (puSrc2->ai8[i] < 0)
12934 puDst->ai8[i] = -puSrc1->ai8[i];
12935 else if (puSrc2->ai8[i] == 0)
12936 puDst->ai8[i] = 0;
12937 else /* puSrc2->ai8[i] > 0 */
12938 puDst->ai8[i] = puSrc1->ai8[i];
12939 }
12940}
12941
12942
12943IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12944{
12945 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12946 {
12947 if (puSrc2->ai16[i] < 0)
12948 puDst->ai16[i] = -puSrc1->ai16[i];
12949 else if (puSrc2->ai16[i] == 0)
12950 puDst->ai16[i] = 0;
12951 else /* puSrc2->ai16[i] > 0 */
12952 puDst->ai16[i] = puSrc1->ai16[i];
12953 }
12954}
12955
12956
12957IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12958{
12959 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
12960 {
12961 if (puSrc2->ai16[i] < 0)
12962 puDst->ai16[i] = -puSrc1->ai16[i];
12963 else if (puSrc2->ai16[i] == 0)
12964 puDst->ai16[i] = 0;
12965 else /* puSrc2->ai16[i] > 0 */
12966 puDst->ai16[i] = puSrc1->ai16[i];
12967 }
12968}
12969
12970
12971IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12972{
12973 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12974 {
12975 if (puSrc2->ai32[i] < 0)
12976 puDst->ai32[i] = -puSrc1->ai32[i];
12977 else if (puSrc2->ai32[i] == 0)
12978 puDst->ai32[i] = 0;
12979 else /* puSrc2->ai32[i] > 0 */
12980 puDst->ai32[i] = puSrc1->ai32[i];
12981 }
12982}
12983
12984
12985IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12986{
12987 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
12988 {
12989 if (puSrc2->ai32[i] < 0)
12990 puDst->ai32[i] = -puSrc1->ai32[i];
12991 else if (puSrc2->ai32[i] == 0)
12992 puDst->ai32[i] = 0;
12993 else /* puSrc2->ai32[i] > 0 */
12994 puDst->ai32[i] = puSrc1->ai32[i];
12995 }
12996}
12997
12998
12999/*
13000 * PHADDW / VPHADDW / PHADDD / VPHADDD
13001 */
13002IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13003{
13004 RTUINT64U uSrc1 = { *puDst };
13005 RTUINT64U uSrc2 = { *puSrc };
13006 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13007
13008 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13009 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13010 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13011 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13012 *puDst = uDst.u;
13013 RT_NOREF(pFpuState);
13014}
13015
13016
13017IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13018{
13019 RTUINT128U uSrc1 = *puDst;
13020
13021 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13022 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13023 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13024 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13025
13026 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13027 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13028 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13029 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13030 RT_NOREF(pFpuState);
13031}
13032
13033
13034IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13035{
13036 RTUINT64U uSrc1 = { *puDst };
13037 RTUINT64U uSrc2 = { *puSrc };
13038 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13039
13040 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13041 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13042 *puDst = uDst.u;
13043 RT_NOREF(pFpuState);
13044}
13045
13046
13047IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13048{
13049 RTUINT128U uSrc1 = *puDst;
13050
13051 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13052 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13053
13054 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13055 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13056 RT_NOREF(pFpuState);
13057}
13058
13059
13060IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13061{
13062 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13063
13064 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13065 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13066 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13067 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13068
13069 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13070 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13071 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13072 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13073
13074 puDst->au64[0] = uDst.au64[0];
13075 puDst->au64[1] = uDst.au64[1];
13076}
13077
13078
13079IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13080{
13081 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13082
13083 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13084 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13085 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13086 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13087 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13088 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13089 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13090 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13091
13092 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13093 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13094 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13095 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13096 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13097 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13098 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13099 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13100
13101 puDst->au64[0] = uDst.au64[0];
13102 puDst->au64[1] = uDst.au64[1];
13103 puDst->au64[2] = uDst.au64[2];
13104 puDst->au64[3] = uDst.au64[3];
13105}
13106
13107
13108IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13109{
13110 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13111
13112 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13113 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13114
13115 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13116 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13117
13118 puDst->au64[0] = uDst.au64[0];
13119 puDst->au64[1] = uDst.au64[1];
13120}
13121
13122
13123IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13124{
13125 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13126
13127 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13128 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13129 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13130 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13131
13132 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13133 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13134 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13135 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13136
13137 puDst->au64[0] = uDst.au64[0];
13138 puDst->au64[1] = uDst.au64[1];
13139 puDst->au64[2] = uDst.au64[2];
13140 puDst->au64[3] = uDst.au64[3];
13141}
13142
13143
13144/*
13145 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13146 */
13147IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13148{
13149 RTUINT64U uSrc1 = { *puDst };
13150 RTUINT64U uSrc2 = { *puSrc };
13151 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13152
13153 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13154 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13155 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13156 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13157 *puDst = uDst.u;
13158 RT_NOREF(pFpuState);
13159}
13160
13161
13162IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13163{
13164 RTUINT128U uSrc1 = *puDst;
13165
13166 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13167 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13168 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13169 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13170
13171 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13172 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13173 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13174 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13175 RT_NOREF(pFpuState);
13176}
13177
13178
13179IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13180{
13181 RTUINT64U uSrc1 = { *puDst };
13182 RTUINT64U uSrc2 = { *puSrc };
13183 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13184
13185 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13186 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13187 *puDst = uDst.u;
13188 RT_NOREF(pFpuState);
13189}
13190
13191
13192IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13193{
13194 RTUINT128U uSrc1 = *puDst;
13195
13196 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13197 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13198
13199 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13200 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13201 RT_NOREF(pFpuState);
13202}
13203
13204
13205IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13206{
13207 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13208
13209 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13210 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13211 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13212 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13213
13214 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13215 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13216 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13217 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13218
13219 puDst->au64[0] = uDst.au64[0];
13220 puDst->au64[1] = uDst.au64[1];
13221}
13222
13223
13224IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13225{
13226 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13227
13228 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13229 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13230 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13231 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13232 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13233 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13234 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13235 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13236
13237 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13238 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13239 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13240 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13241 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13242 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13243 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13244 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13245
13246 puDst->au64[0] = uDst.au64[0];
13247 puDst->au64[1] = uDst.au64[1];
13248 puDst->au64[2] = uDst.au64[2];
13249 puDst->au64[3] = uDst.au64[3];
13250}
13251
13252
13253IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13254{
13255 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13256
13257 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13258 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13259
13260 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13261 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13262
13263 puDst->au64[0] = uDst.au64[0];
13264 puDst->au64[1] = uDst.au64[1];
13265}
13266
13267
13268IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13269{
13270 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13271
13272 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13273 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13274 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13275 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13276
13277 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13278 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13279 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13280 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13281
13282 puDst->au64[0] = uDst.au64[0];
13283 puDst->au64[1] = uDst.au64[1];
13284 puDst->au64[2] = uDst.au64[2];
13285 puDst->au64[3] = uDst.au64[3];
13286}
13287
13288
13289/*
13290 * PHADDSW / VPHADDSW
13291 */
13292IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13293{
13294 RTUINT64U uSrc1 = { *puDst };
13295 RTUINT64U uSrc2 = { *puSrc };
13296 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13297
13298 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13299 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13300 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13301 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13302 *puDst = uDst.u;
13303 RT_NOREF(pFpuState);
13304}
13305
13306
13307IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13308{
13309 RTUINT128U uSrc1 = *puDst;
13310
13311 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13312 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13313 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13314 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13315
13316 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13317 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13318 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13319 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13320 RT_NOREF(pFpuState);
13321}
13322
13323
13324IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13325{
13326 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13327
13328 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13329 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13330 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13331 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13332
13333 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13334 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13335 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13336 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13337
13338 puDst->au64[0] = uDst.au64[0];
13339 puDst->au64[1] = uDst.au64[1];
13340}
13341
13342
13343IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13344{
13345 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13346
13347 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13348 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13349 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13350 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13351 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13352 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13353 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13354 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13355
13356 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13357 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13358 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13359 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13360 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13361 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13362 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13363 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13364
13365 puDst->au64[0] = uDst.au64[0];
13366 puDst->au64[1] = uDst.au64[1];
13367 puDst->au64[2] = uDst.au64[2];
13368 puDst->au64[3] = uDst.au64[3];
13369}
13370
13371
13372/*
13373 * PHSUBSW / VPHSUBSW
13374 */
13375IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13376{
13377 RTUINT64U uSrc1 = { *puDst };
13378 RTUINT64U uSrc2 = { *puSrc };
13379 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13380
13381 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13382 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13383 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13384 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13385 *puDst = uDst.u;
13386 RT_NOREF(pFpuState);
13387}
13388
13389
13390IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13391{
13392 RTUINT128U uSrc1 = *puDst;
13393
13394 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13395 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13396 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13397 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13398
13399 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13400 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13401 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13402 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13403 RT_NOREF(pFpuState);
13404}
13405
13406
13407IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13408{
13409 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13410
13411 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13412 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13413 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13414 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13415
13416 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13417 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13418 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13419 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13420
13421 puDst->au64[0] = uDst.au64[0];
13422 puDst->au64[1] = uDst.au64[1];
13423}
13424
13425
13426IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13427{
13428 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13429
13430 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13431 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13432 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13433 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13434 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13435 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13436 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13437 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13438
13439 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13440 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13441 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13442 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13443 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13444 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13445 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13446 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13447
13448 puDst->au64[0] = uDst.au64[0];
13449 puDst->au64[1] = uDst.au64[1];
13450 puDst->au64[2] = uDst.au64[2];
13451 puDst->au64[3] = uDst.au64[3];
13452}
13453
13454
13455/*
13456 * PMADDUBSW / VPMADDUBSW
13457 */
13458IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13459{
13460 RTUINT64U uSrc1 = { *puDst };
13461 RTUINT64U uSrc2 = { *puSrc };
13462 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13463
13464 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13465 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13466 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13467 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13468 *puDst = uDst.u;
13469 RT_NOREF(pFpuState);
13470}
13471
13472
13473IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13474{
13475 RTUINT128U uSrc1 = *puDst;
13476
13477 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13478 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13479 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13480 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13481 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13482 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13483 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13484 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13485 RT_NOREF(pFpuState);
13486}
13487
13488
13489IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13490{
13491 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13492
13493 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13494 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13495 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13496 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13497 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13498 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13499 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13500 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13501
13502 puDst->au64[0] = uDst.au64[0];
13503 puDst->au64[1] = uDst.au64[1];
13504}
13505
13506
13507IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13508{
13509 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13510
13511 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13512 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13513 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13514 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13515 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13516 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13517 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13518 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13519 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13520 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13521 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13522 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13523 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13524 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13525 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13526 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13527
13528 puDst->au64[0] = uDst.au64[0];
13529 puDst->au64[1] = uDst.au64[1];
13530 puDst->au64[2] = uDst.au64[2];
13531 puDst->au64[3] = uDst.au64[3];
13532}
13533
13534
13535/*
13536 * PMULHRSW / VPMULHRSW
13537 */
/** Multiplies the two operands as 32-bit signed values, shifts the product
 * right by 14, adds one, shifts right by one more and truncates the outcome
 * to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    (uint16_t)( ( ( ((int32_t)(a_Src1) * (int32_t)(a_Src2)) >> 14 ) + 1 ) >> 1 )
13540
13541IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13542{
13543 RTUINT64U uSrc1 = { *puDst };
13544 RTUINT64U uSrc2 = { *puSrc };
13545 RTUINT64U uDst;
13546
13547 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13548 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13549 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13550 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13551 *puDst = uDst.u;
13552 RT_NOREF(pFpuState);
13553}
13554
13555
13556IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13557{
13558 RTUINT128U uSrc1 = *puDst;
13559
13560 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13561 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13562 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13563 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13564 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13565 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13566 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13567 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13568 RT_NOREF(pFpuState);
13569}
13570
13571
13572IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13573{
13574 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13575
13576 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13577 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13578 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13579 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13580 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13581 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13582 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13583 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13584
13585 puDst->au64[0] = uDst.au64[0];
13586 puDst->au64[1] = uDst.au64[1];
13587}
13588
13589
13590IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13591{
13592 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13593
13594 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13595 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13596 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13597 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13598 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13599 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13600 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13601 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13602 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13603 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13604 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13605 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13606 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13607 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13608 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13609 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13610
13611 puDst->au64[0] = uDst.au64[0];
13612 puDst->au64[1] = uDst.au64[1];
13613 puDst->au64[2] = uDst.au64[2];
13614 puDst->au64[3] = uDst.au64[3];
13615}
13616
13617
13618/*
13619 * PSADBW / VPSADBW
13620 */
13621#ifdef IEM_WITHOUT_ASSEMBLY
13622
13623IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13624{
13625 RTUINT64U uSrc1 = { *puDst };
13626 RTUINT64U uSrc2 = { *puSrc };
13627 RTUINT64U uDst;
13628 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13629 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13630 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13631 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13632 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13633 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13634 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13635 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13636
13637 uDst.au64[0] = 0;
13638 uDst.au16[0] = uSum;
13639 *puDst = uDst.u;
13640}
13641
13642
13643IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13644{
13645 RTUINT128U uSrc1 = *puDst;
13646
13647 puDst->au64[0] = 0;
13648 puDst->au64[1] = 0;
13649
13650 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13651 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13652 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13653 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13654 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13655 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13656 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13657 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13658 puDst->au16[0] = uSum;
13659
13660 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13661 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13662 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13663 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13664 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13665 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13666 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13667 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13668 puDst->au16[4] = uSum;
13669}
13670
13671#endif
13672
13673IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13674{
13675 RTUINT128U uSrc1 = *puSrc1;
13676 RTUINT128U uSrc2 = *puSrc2;
13677
13678 puDst->au64[0] = 0;
13679 puDst->au64[1] = 0;
13680
13681 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13682 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13683 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13684 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13685 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13686 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13687 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13688 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13689 puDst->au16[0] = uSum;
13690
13691 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13692 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13693 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13694 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13695 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13696 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13697 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13698 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13699 puDst->au16[4] = uSum;
13700}
13701
13702IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13703{
13704 RTUINT256U uSrc1 = *puSrc1;
13705 RTUINT256U uSrc2 = *puSrc2;
13706
13707 puDst->au64[0] = 0;
13708 puDst->au64[1] = 0;
13709 puDst->au64[2] = 0;
13710 puDst->au64[3] = 0;
13711
13712 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13713 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13714 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13715 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13716 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13717 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13718 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13719 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13720 puDst->au16[0] = uSum;
13721
13722 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13723 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13724 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13725 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13726 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13727 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13728 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13729 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13730 puDst->au16[4] = uSum;
13731
13732 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
13733 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
13734 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
13735 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
13736 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
13737 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
13738 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
13739 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
13740 puDst->au16[8] = uSum;
13741
13742 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
13743 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
13744 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
13745 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
13746 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
13747 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
13748 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
13749 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
13750 puDst->au16[12] = uSum;
13751}
13752
13753
13754/*
13755 * PMULDQ / VPMULDQ
13756 */
13757IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13758{
13759 RTUINT128U uSrc1 = *puDst;
13760
13761 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
13762 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
13763}
13764
13765IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13766{
13767 RTUINT128U uSrc1 = *puSrc1;
13768 RTUINT128U uSrc2 = *puSrc2;
13769
13770 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13771 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13772}
13773
13774IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13775{
13776 RTUINT256U uSrc1 = *puSrc1;
13777 RTUINT256U uSrc2 = *puSrc2;
13778
13779 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
13780 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
13781 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
13782 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
13783}
13784
13785
13786/*
13787 * PMULUDQ / VPMULUDQ
13788 */
13789#ifdef IEM_WITHOUT_ASSEMBLY
13790
13791IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13792{
13793 RTUINT64U uSrc1 = { *puDst };
13794 RTUINT64U uSrc2 = { *puSrc };
13795 ASMCompilerBarrier();
13796 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13797 RT_NOREF(pFpuState);
13798}
13799
13800
13801IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13802{
13803 RTUINT128U uSrc1 = *puDst;
13804 RTUINT128U uSrc2 = *puSrc;
13805 ASMCompilerBarrier();
13806 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13807 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13808 RT_NOREF(pFpuState);
13809}
13810
13811#endif
13812
13813IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13814{
13815 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13816 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13817 ASMCompilerBarrier();
13818 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13819 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13820}
13821
13822
13823IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13824{
13825 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13826 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13827 ASMCompilerBarrier();
13828 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
13829 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
13830 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
13831 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
13832}
13833
13834
13835/*
13836 * UNPCKLPS / VUNPCKLPS
13837 */
13838#ifdef IEM_WITHOUT_ASSEMBLY
13839IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13840{
13841 RTUINT128U uSrc1 = *puDst;
13842 RTUINT128U uSrc2 = *puSrc;
13843 ASMCompilerBarrier();
13844 puDst->au32[0] = uSrc1.au32[0];
13845 puDst->au32[1] = uSrc2.au32[0];
13846 puDst->au32[2] = uSrc1.au32[1];
13847 puDst->au32[3] = uSrc2.au32[1];
13848}
13849
13850#endif
13851
13852IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13853{
13854 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13855 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13856 ASMCompilerBarrier();
13857 puDst->au32[0] = uSrc1.au32[0];
13858 puDst->au32[1] = uSrc2.au32[0];
13859 puDst->au32[2] = uSrc1.au32[1];
13860 puDst->au32[3] = uSrc2.au32[1];
13861}
13862
13863
13864IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13865{
13866 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13867 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13868 ASMCompilerBarrier();
13869 puDst->au32[0] = uSrc1.au32[0];
13870 puDst->au32[1] = uSrc2.au32[0];
13871 puDst->au32[2] = uSrc1.au32[1];
13872 puDst->au32[3] = uSrc2.au32[1];
13873
13874 puDst->au32[4] = uSrc1.au32[4];
13875 puDst->au32[5] = uSrc2.au32[4];
13876 puDst->au32[6] = uSrc1.au32[5];
13877 puDst->au32[7] = uSrc2.au32[5];
13878}
13879
13880
13881/*
13882 * UNPCKLPD / VUNPCKLPD
13883 */
13884#ifdef IEM_WITHOUT_ASSEMBLY
13885IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13886{
13887 RTUINT128U uSrc1 = *puDst;
13888 RTUINT128U uSrc2 = *puSrc;
13889 ASMCompilerBarrier();
13890 puDst->au64[0] = uSrc1.au64[0];
13891 puDst->au64[1] = uSrc2.au64[0];
13892}
13893
13894#endif
13895
13896IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13897{
13898 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13899 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13900 ASMCompilerBarrier();
13901 puDst->au64[0] = uSrc1.au64[0];
13902 puDst->au64[1] = uSrc2.au64[0];
13903}
13904
13905
13906IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13907{
13908 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13909 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13910 ASMCompilerBarrier();
13911 puDst->au64[0] = uSrc1.au64[0];
13912 puDst->au64[1] = uSrc2.au64[0];
13913 puDst->au64[2] = uSrc1.au64[2];
13914 puDst->au64[3] = uSrc2.au64[2];
13915}
13916
13917
13918/*
13919 * UNPCKHPS / VUNPCKHPS
13920 */
13921#ifdef IEM_WITHOUT_ASSEMBLY
13922IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13923{
13924 RTUINT128U uSrc1 = *puDst;
13925 RTUINT128U uSrc2 = *puSrc;
13926 ASMCompilerBarrier();
13927 puDst->au32[0] = uSrc1.au32[2];
13928 puDst->au32[1] = uSrc2.au32[2];
13929 puDst->au32[2] = uSrc1.au32[3];
13930 puDst->au32[3] = uSrc2.au32[3];
13931}
13932
13933#endif
13934
13935IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13936{
13937 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13938 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13939 ASMCompilerBarrier();
13940 puDst->au32[0] = uSrc1.au32[2];
13941 puDst->au32[1] = uSrc2.au32[2];
13942 puDst->au32[2] = uSrc1.au32[3];
13943 puDst->au32[3] = uSrc2.au32[3];
13944}
13945
13946
13947IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13948{
13949 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13950 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13951 ASMCompilerBarrier();
13952 puDst->au32[0] = uSrc1.au32[2];
13953 puDst->au32[1] = uSrc2.au32[2];
13954 puDst->au32[2] = uSrc1.au32[3];
13955 puDst->au32[3] = uSrc2.au32[3];
13956
13957 puDst->au32[4] = uSrc1.au32[6];
13958 puDst->au32[5] = uSrc2.au32[6];
13959 puDst->au32[6] = uSrc1.au32[7];
13960 puDst->au32[7] = uSrc2.au32[7];
13961}
13962
13963
13964/*
13965 * UNPCKHPD / VUNPCKHPD
13966 */
13967#ifdef IEM_WITHOUT_ASSEMBLY
13968IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13969{
13970 RTUINT128U uSrc1 = *puDst;
13971 RTUINT128U uSrc2 = *puSrc;
13972 ASMCompilerBarrier();
13973 puDst->au64[0] = uSrc1.au64[1];
13974 puDst->au64[1] = uSrc2.au64[1];
13975}
13976
13977#endif
13978
13979IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13980{
13981 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
13982 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
13983 ASMCompilerBarrier();
13984 puDst->au64[0] = uSrc1.au64[1];
13985 puDst->au64[1] = uSrc2.au64[1];
13986}
13987
13988
13989IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13990{
13991 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
13992 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
13993 ASMCompilerBarrier();
13994 puDst->au64[0] = uSrc1.au64[1];
13995 puDst->au64[1] = uSrc2.au64[1];
13996 puDst->au64[2] = uSrc1.au64[3];
13997 puDst->au64[3] = uSrc2.au64[3];
13998}
13999
14000
14001/*
14002 * CRC32 (SSE 4.2).
14003 */
14004
14005IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14006{
14007 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14008}
14009
14010
14011IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14012{
14013 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14014}
14015
14016IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14017{
14018 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14019}
14020
14021IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14022{
14023 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14024}
14025
14026
14027/*
14028 * PTEST (SSE 4.1) - special as it output only EFLAGS.
14029 */
14030#ifdef IEM_WITHOUT_ASSEMBLY
14031IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14032{
14033 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14034 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14035 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14036 fEfl |= X86_EFL_ZF;
14037 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14038 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14039 fEfl |= X86_EFL_CF;
14040 *pfEFlags = fEfl;
14041}
14042#endif
14043
14044IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14045{
14046 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14047 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14048 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14049 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14050 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14051 fEfl |= X86_EFL_ZF;
14052 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14053 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14054 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14055 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14056 fEfl |= X86_EFL_CF;
14057 *pfEFlags = fEfl;
14058}
14059
14060
14061/*
14062 * PMOVSXBW / VPMOVSXBW
14063 */
14064IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14065{
14066 RTUINT64U uSrc1 = { uSrc };
14067 puDst->ai16[0] = uSrc1.ai8[0];
14068 puDst->ai16[1] = uSrc1.ai8[1];
14069 puDst->ai16[2] = uSrc1.ai8[2];
14070 puDst->ai16[3] = uSrc1.ai8[3];
14071 puDst->ai16[4] = uSrc1.ai8[4];
14072 puDst->ai16[5] = uSrc1.ai8[5];
14073 puDst->ai16[6] = uSrc1.ai8[6];
14074 puDst->ai16[7] = uSrc1.ai8[7];
14075}
14076
14077
14078IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14079{
14080 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14081 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14082 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14083 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14084 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14085 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14086 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14087 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14088 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14089 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14090 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14091 puDst->ai16[10] = uSrc1.ai8[10];
14092 puDst->ai16[11] = uSrc1.ai8[11];
14093 puDst->ai16[12] = uSrc1.ai8[12];
14094 puDst->ai16[13] = uSrc1.ai8[13];
14095 puDst->ai16[14] = uSrc1.ai8[14];
14096 puDst->ai16[15] = uSrc1.ai8[15];
14097}
14098
14099
14100/*
14101 * PMOVSXBD / VPMOVSXBD
14102 */
14103IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14104{
14105 RTUINT32U uSrc1 = { uSrc };
14106 puDst->ai32[0] = uSrc1.ai8[0];
14107 puDst->ai32[1] = uSrc1.ai8[1];
14108 puDst->ai32[2] = uSrc1.ai8[2];
14109 puDst->ai32[3] = uSrc1.ai8[3];
14110}
14111
14112
14113IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14114{
14115 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14116 puDst->ai32[0] = uSrc1.ai8[0];
14117 puDst->ai32[1] = uSrc1.ai8[1];
14118 puDst->ai32[2] = uSrc1.ai8[2];
14119 puDst->ai32[3] = uSrc1.ai8[3];
14120 puDst->ai32[4] = uSrc1.ai8[4];
14121 puDst->ai32[5] = uSrc1.ai8[5];
14122 puDst->ai32[6] = uSrc1.ai8[6];
14123 puDst->ai32[7] = uSrc1.ai8[7];
14124}
14125
14126
14127/*
14128 * PMOVSXBQ / VPMOVSXBQ
14129 */
14130IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14131{
14132 RTUINT16U uSrc1 = { uSrc };
14133 puDst->ai64[0] = uSrc1.ai8[0];
14134 puDst->ai64[1] = uSrc1.ai8[1];
14135}
14136
14137
14138IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14139{
14140 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14141 puDst->ai64[0] = uSrc1.ai8[0];
14142 puDst->ai64[1] = uSrc1.ai8[1];
14143 puDst->ai64[2] = uSrc1.ai8[2];
14144 puDst->ai64[3] = uSrc1.ai8[3];
14145}
14146
14147
14148/*
14149 * PMOVSXWD / VPMOVSXWD
14150 */
14151IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14152{
14153 RTUINT64U uSrc1 = { uSrc };
14154 puDst->ai32[0] = uSrc1.ai16[0];
14155 puDst->ai32[1] = uSrc1.ai16[1];
14156 puDst->ai32[2] = uSrc1.ai16[2];
14157 puDst->ai32[3] = uSrc1.ai16[3];
14158}
14159
14160
14161IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14162{
14163 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14164 puDst->ai32[0] = uSrc1.ai16[0];
14165 puDst->ai32[1] = uSrc1.ai16[1];
14166 puDst->ai32[2] = uSrc1.ai16[2];
14167 puDst->ai32[3] = uSrc1.ai16[3];
14168 puDst->ai32[4] = uSrc1.ai16[4];
14169 puDst->ai32[5] = uSrc1.ai16[5];
14170 puDst->ai32[6] = uSrc1.ai16[6];
14171 puDst->ai32[7] = uSrc1.ai16[7];
14172}
14173
14174
14175/*
14176 * PMOVSXWQ / VPMOVSXWQ
14177 */
14178IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14179{
14180 RTUINT32U uSrc1 = { uSrc };
14181 puDst->ai64[0] = uSrc1.ai16[0];
14182 puDst->ai64[1] = uSrc1.ai16[1];
14183}
14184
14185
14186IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14187{
14188 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14189 puDst->ai64[0] = uSrc1.ai16[0];
14190 puDst->ai64[1] = uSrc1.ai16[1];
14191 puDst->ai64[2] = uSrc1.ai16[2];
14192 puDst->ai64[3] = uSrc1.ai16[3];
14193}
14194
14195
14196/*
14197 * PMOVSXDQ / VPMOVSXDQ
14198 */
14199IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14200{
14201 RTUINT64U uSrc1 = { uSrc };
14202 puDst->ai64[0] = uSrc1.ai32[0];
14203 puDst->ai64[1] = uSrc1.ai32[1];
14204}
14205
14206
14207IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14208{
14209 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14210 puDst->ai64[0] = uSrc1.ai32[0];
14211 puDst->ai64[1] = uSrc1.ai32[1];
14212 puDst->ai64[2] = uSrc1.ai32[2];
14213 puDst->ai64[3] = uSrc1.ai32[3];
14214}
14215
14216
14217/*
14218 * PMOVZXBW / VPMOVZXBW
14219 */
14220IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14221{
14222 RTUINT64U uSrc1 = { uSrc };
14223 puDst->au16[0] = uSrc1.au8[0];
14224 puDst->au16[1] = uSrc1.au8[1];
14225 puDst->au16[2] = uSrc1.au8[2];
14226 puDst->au16[3] = uSrc1.au8[3];
14227 puDst->au16[4] = uSrc1.au8[4];
14228 puDst->au16[5] = uSrc1.au8[5];
14229 puDst->au16[6] = uSrc1.au8[6];
14230 puDst->au16[7] = uSrc1.au8[7];
14231}
14232
14233
14234IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14235{
14236 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14237 puDst->au16[ 0] = uSrc1.au8[ 0];
14238 puDst->au16[ 1] = uSrc1.au8[ 1];
14239 puDst->au16[ 2] = uSrc1.au8[ 2];
14240 puDst->au16[ 3] = uSrc1.au8[ 3];
14241 puDst->au16[ 4] = uSrc1.au8[ 4];
14242 puDst->au16[ 5] = uSrc1.au8[ 5];
14243 puDst->au16[ 6] = uSrc1.au8[ 6];
14244 puDst->au16[ 7] = uSrc1.au8[ 7];
14245 puDst->au16[ 8] = uSrc1.au8[ 8];
14246 puDst->au16[ 9] = uSrc1.au8[ 9];
14247 puDst->au16[10] = uSrc1.au8[10];
14248 puDst->au16[11] = uSrc1.au8[11];
14249 puDst->au16[12] = uSrc1.au8[12];
14250 puDst->au16[13] = uSrc1.au8[13];
14251 puDst->au16[14] = uSrc1.au8[14];
14252 puDst->au16[15] = uSrc1.au8[15];
14253}
14254
14255
14256/*
14257 * PMOVZXBD / VPMOVZXBD
14258 */
14259IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14260{
14261 RTUINT32U uSrc1 = { uSrc };
14262 puDst->au32[0] = uSrc1.au8[0];
14263 puDst->au32[1] = uSrc1.au8[1];
14264 puDst->au32[2] = uSrc1.au8[2];
14265 puDst->au32[3] = uSrc1.au8[3];
14266}
14267
14268
14269IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14270{
14271 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14272 puDst->au32[0] = uSrc1.au8[0];
14273 puDst->au32[1] = uSrc1.au8[1];
14274 puDst->au32[2] = uSrc1.au8[2];
14275 puDst->au32[3] = uSrc1.au8[3];
14276 puDst->au32[4] = uSrc1.au8[4];
14277 puDst->au32[5] = uSrc1.au8[5];
14278 puDst->au32[6] = uSrc1.au8[6];
14279 puDst->au32[7] = uSrc1.au8[7];
14280}
14281
14282
14283/*
14284 * PMOVZXBQ / VPMOVZXBQ
14285 */
14286IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14287{
14288 RTUINT16U uSrc1 = { uSrc };
14289 puDst->au64[0] = uSrc1.au8[0];
14290 puDst->au64[1] = uSrc1.au8[1];
14291}
14292
14293
14294IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14295{
14296 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14297 puDst->au64[0] = uSrc1.au8[0];
14298 puDst->au64[1] = uSrc1.au8[1];
14299 puDst->au64[2] = uSrc1.au8[2];
14300 puDst->au64[3] = uSrc1.au8[3];
14301}
14302
14303
14304/*
14305 * PMOVZXWD / VPMOVZXWD
14306 */
14307IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14308{
14309 RTUINT64U uSrc1 = { uSrc };
14310 puDst->au32[0] = uSrc1.au16[0];
14311 puDst->au32[1] = uSrc1.au16[1];
14312 puDst->au32[2] = uSrc1.au16[2];
14313 puDst->au32[3] = uSrc1.au16[3];
14314}
14315
14316
14317IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14318{
14319 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14320 puDst->au32[0] = uSrc1.au16[0];
14321 puDst->au32[1] = uSrc1.au16[1];
14322 puDst->au32[2] = uSrc1.au16[2];
14323 puDst->au32[3] = uSrc1.au16[3];
14324 puDst->au32[4] = uSrc1.au16[4];
14325 puDst->au32[5] = uSrc1.au16[5];
14326 puDst->au32[6] = uSrc1.au16[6];
14327 puDst->au32[7] = uSrc1.au16[7];
14328}
14329
14330
14331/*
14332 * PMOVZXWQ / VPMOVZXWQ
14333 */
14334IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14335{
14336 RTUINT32U uSrc1 = { uSrc };
14337 puDst->au64[0] = uSrc1.au16[0];
14338 puDst->au64[1] = uSrc1.au16[1];
14339}
14340
14341
14342IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14343{
14344 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14345 puDst->au64[0] = uSrc1.au16[0];
14346 puDst->au64[1] = uSrc1.au16[1];
14347 puDst->au64[2] = uSrc1.au16[2];
14348 puDst->au64[3] = uSrc1.au16[3];
14349}
14350
14351
14352/*
14353 * PMOVZXDQ / VPMOVZXDQ
14354 */
14355IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14356{
14357 RTUINT64U uSrc1 = { uSrc };
14358 puDst->au64[0] = uSrc1.au32[0];
14359 puDst->au64[1] = uSrc1.au32[1];
14360}
14361
14362
14363IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14364{
14365 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14366 puDst->au64[0] = uSrc1.au32[0];
14367 puDst->au64[1] = uSrc1.au32[1];
14368 puDst->au64[2] = uSrc1.au32[2];
14369 puDst->au64[3] = uSrc1.au32[3];
14370}
14371
14372
14373#ifdef IEM_WITHOUT_ASSEMBLY
14374/**
14375 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14376 * the SoftFloat 32-bit floating point format (float32_t).
14377 *
14378 * This is only a structure format conversion, nothing else.
14379 */
14380DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14381{
14382 float32_t Tmp;
14383 Tmp.v = pr32Val->u;
14384 return Tmp;
14385}
14386
14387
14388/**
14389 * Converts from SoftFloat 32-bit floating point format (float32_t)
14390 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14391 *
14392 * This is only a structure format conversion, nothing else.
14393 */
14394DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14395{
14396 pr32Dst->u = r32XSrc.v;
14397 return pr32Dst;
14398}
14399
14400
14401/**
14402 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14403 * the SoftFloat 64-bit floating point format (float64_t).
14404 *
14405 * This is only a structure format conversion, nothing else.
14406 */
14407DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14408{
14409 float64_t Tmp;
14410 Tmp.v = pr64Val->u;
14411 return Tmp;
14412}
14413
14414
14415/**
14416 * Converts from SoftFloat 64-bit floating point format (float64_t)
14417 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14418 *
14419 * This is only a structure format conversion, nothing else.
14420 */
14421DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14422{
14423 pr64Dst->u = r64XSrc.v;
14424 return pr64Dst;
14425}
14426
14427
/**
 * Initializer for the SoftFloat state structure, derived from an MXCSR value.
 *
 * The rounding mode is translated from the MXCSR.RC field and the MXCSR
 * exception mask bits are shifted down so that they line up with X86_FSW_?E.
 *
 * @param   a_Mxcsr     The MXCSR value (evaluated multiple times).
 * @note    Which structure member each entry lands in depends on the
 *          softfloat_state_t declaration, which is not visible here; the member
 *          names in the comments below are assumptions to be confirmed.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection (presumably): */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode, MXCSR.RC translated to SoftFloat (presumably): */ \
        (uint8_t)(  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? softfloat_round_near_even \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? softfloat_round_max \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? softfloat_round_min \
                  :                                                           softfloat_round_minMag), \
        /* Exception flags start out clear (presumably): */ \
        0, \
        /* Exception mask, matches X86_FSW_?E after the shift: */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD: */ \
        32 \
    }
14440
14441
14442/**
14443 * Helper for transfering exception to MXCSR and setting the result value
14444 * accordingly.
14445 *
14446 * @returns Updated MXCSR.
14447 * @param pSoftState The SoftFloat state following the operation.
14448 * @param r32Result The result of the SoftFloat operation.
14449 * @param pr32Result Where to store the result for IEM.
14450 * @param fMxcsr The original MXCSR value.
14451 */
14452DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14453 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14454{
14455 iemFpSoftF32ToIprt(pr32Result, r32Result);
14456
14457 uint8_t fXcpt = pSoftState->exceptionFlags;
14458 if ( (fMxcsr & X86_MXCSR_FZ)
14459 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14460 {
14461 /* Underflow masked and flush to zero is set. */
14462 pr32Result->s.uFraction = 0;
14463 pr32Result->s.uExponent = 0;
14464 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14465 }
14466
14467 /* If DAZ is set \#DE is never set. */
14468 if ( fMxcsr & X86_MXCSR_DAZ
14469 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14470 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14471 fXcpt &= ~X86_MXCSR_DE;
14472
14473 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14474}
14475
14476
14477/**
14478 * Helper for transfering exception to MXCSR and setting the result value
14479 * accordingly - ignores Flush-to-Zero.
14480 *
14481 * @returns Updated MXCSR.
14482 * @param pSoftState The SoftFloat state following the operation.
14483 * @param r32Result The result of the SoftFloat operation.
14484 * @param pr32Result Where to store the result for IEM.
14485 * @param fMxcsr The original MXCSR value.
14486 */
14487DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14488 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14489{
14490 iemFpSoftF32ToIprt(pr32Result, r32Result);
14491
14492 uint8_t fXcpt = pSoftState->exceptionFlags;
14493 /* If DAZ is set \#DE is never set. */
14494 if ( fMxcsr & X86_MXCSR_DAZ
14495 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14496 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14497 fXcpt &= ~X86_MXCSR_DE;
14498
14499 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14500}
14501
14502
14503/**
14504 * Helper for transfering exception to MXCSR and setting the result value
14505 * accordingly.
14506 *
14507 * @returns Updated MXCSR.
14508 * @param pSoftState The SoftFloat state following the operation.
14509 * @param r64Result The result of the SoftFloat operation.
14510 * @param pr64Result Where to store the result for IEM.
14511 * @param fMxcsr The original MXCSR value.
14512 */
14513DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14514 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14515{
14516 iemFpSoftF64ToIprt(pr64Result, r64Result);
14517 uint8_t fXcpt = pSoftState->exceptionFlags;
14518 if ( (fMxcsr & X86_MXCSR_FZ)
14519 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14520 {
14521 /* Underflow masked and flush to zero is set. */
14522 iemFpSoftF64ToIprt(pr64Result, r64Result);
14523 pr64Result->s.uFractionHigh = 0;
14524 pr64Result->s.uFractionLow = 0;
14525 pr64Result->s.uExponent = 0;
14526 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14527 }
14528
14529 /* If DAZ is set \#DE is never set. */
14530 if ( fMxcsr & X86_MXCSR_DAZ
14531 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14532 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14533 fXcpt &= ~X86_MXCSR_DE;
14534
14535 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14536}
14537
14538
14539/**
14540 * Helper for transfering exception to MXCSR and setting the result value
14541 * accordingly - ignores Flush-to-Zero.
14542 *
14543 * @returns Updated MXCSR.
14544 * @param pSoftState The SoftFloat state following the operation.
14545 * @param r64Result The result of the SoftFloat operation.
14546 * @param pr64Result Where to store the result for IEM.
14547 * @param fMxcsr The original MXCSR value.
14548 */
14549DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14550 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14551{
14552 iemFpSoftF64ToIprt(pr64Result, r64Result);
14553
14554 uint8_t fXcpt = pSoftState->exceptionFlags;
14555 /* If DAZ is set \#DE is never set. */
14556 if ( fMxcsr & X86_MXCSR_DAZ
14557 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14558 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14559 fXcpt &= ~X86_MXCSR_DE;
14560
14561 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14562}
14563
14564
14565/**
14566 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14567 * in MXCSR into account.
14568 *
14569 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14570 * @param pr32Val Where to store the result.
14571 * @param fMxcsr The input MXCSR value.
14572 * @param pr32Src The value to use.
14573 */
14574DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14575{
14576 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14577 {
14578 if (fMxcsr & X86_MXCSR_DAZ)
14579 {
14580 /* De-normals are changed to 0. */
14581 pr32Val->s.fSign = pr32Src->s.fSign;
14582 pr32Val->s.uFraction = 0;
14583 pr32Val->s.uExponent = 0;
14584 return 0;
14585 }
14586
14587 *pr32Val = *pr32Src;
14588 return X86_MXCSR_DE;
14589 }
14590
14591 *pr32Val = *pr32Src;
14592 return 0;
14593}
14594
14595
14596/**
14597 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14598 * in MXCSR into account.
14599 *
14600 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14601 * @param pr64Val Where to store the result.
14602 * @param fMxcsr The input MXCSR value.
14603 * @param pr64Src The value to use.
14604 */
14605DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14606{
14607 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14608 {
14609 if (fMxcsr & X86_MXCSR_DAZ)
14610 {
14611 /* De-normals are changed to 0. */
14612 pr64Val->s64.fSign = pr64Src->s.fSign;
14613 pr64Val->s64.uFraction = 0;
14614 pr64Val->s64.uExponent = 0;
14615 return 0;
14616 }
14617
14618 *pr64Val = *pr64Src;
14619 return X86_MXCSR_DE;
14620 }
14621
14622 *pr64Val = *pr64Src;
14623 return 0;
14624}
14625
14626
14627/**
14628 * Validates the given input operands returning whether the operation can continue or whether one
14629 * of the source operands contains a NaN value, setting the output accordingly.
14630 *
14631 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14632 * @param pr32Res Where to store the result in case the operation can't continue.
14633 * @param pr32Val1 The first input operand.
14634 * @param pr32Val2 The second input operand.
14635 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14636 */
14637DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14638{
14639 uint8_t cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14640 uint8_t cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14641 if (cSNan + cQNan == 2)
14642 {
14643 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14644 *pr32Res = *pr32Val1;
14645 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14646 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14647 return true;
14648 }
14649 else if (cSNan)
14650 {
14651 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14652 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14653 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14654 *pfMxcsr |= X86_MXCSR_IE;
14655 return true;
14656 }
14657 else if (cQNan)
14658 {
14659 /* The QNan operand is placed into the result. */
14660 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14661 return true;
14662 }
14663
14664 Assert(!cQNan && !cSNan);
14665 return false;
14666}
14667
14668
14669/**
14670 * Validates the given double precision input operands returning whether the operation can continue or whether one
14671 * of the source operands contains a NaN value, setting the output accordingly.
14672 *
14673 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14674 * @param pr64Res Where to store the result in case the operation can't continue.
14675 * @param pr64Val1 The first input operand.
14676 * @param pr64Val2 The second input operand.
14677 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14678 */
14679DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14680{
14681 uint8_t cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14682 uint8_t cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14683 if (cSNan + cQNan == 2)
14684 {
14685 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14686 *pr64Res = *pr64Val1;
14687 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14688 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14689 return true;
14690 }
14691 else if (cSNan)
14692 {
14693 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14694 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14695 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14696 *pfMxcsr |= X86_MXCSR_IE;
14697 return true;
14698 }
14699 else if (cQNan)
14700 {
14701 /* The QNan operand is placed into the result. */
14702 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14703 return true;
14704 }
14705
14706 Assert(!cQNan && !cSNan);
14707 return false;
14708}
14709
14710
14711/**
14712 * Validates the given single input operand returning whether the operation can continue or whether
14713 * contains a NaN value, setting the output accordingly.
14714 *
14715 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14716 * @param pr32Res Where to store the result in case the operation can't continue.
14717 * @param pr32Val The input operand.
14718 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14719 */
14720DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
14721{
14722 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
14723 {
14724 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14725 *pr32Res = *pr32Val;
14726 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14727 *pfMxcsr |= X86_MXCSR_IE;
14728 return true;
14729 }
14730 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
14731 {
14732 /* The QNan operand is placed into the result. */
14733 *pr32Res = *pr32Val;
14734 return true;
14735 }
14736
14737 return false;
14738}
14739
14740
14741/**
14742 * Validates the given double input operand returning whether the operation can continue or whether
14743 * contains a NaN value, setting the output accordingly.
14744 *
14745 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
14746 * @param pr64Res Where to store the result in case the operation can't continue.
14747 * @param pr64Val The input operand.
14748 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14749 */
14750DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
14751{
14752 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
14753 {
14754 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14755 *pr64Res = *pr64Val;
14756 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14757 *pfMxcsr |= X86_MXCSR_IE;
14758 return true;
14759 }
14760 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
14761 {
14762 /* The QNan operand is placed into the result. */
14763 *pr64Res = *pr64Val;
14764 return true;
14765 }
14766
14767 return false;
14768}
14769#endif
14770
14771
14772/**
14773 * ADDPS
14774 */
14775#ifdef IEM_WITHOUT_ASSEMBLY
14776static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14777{
14778 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14779 return fMxcsr;
14780
14781 RTFLOAT32U r32Src1, r32Src2;
14782 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14783 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14785 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14786 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14787}
14788
14789
14790IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14791{
14792 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14793 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14794 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14795 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14796}
14797#endif
14798
14799
14800/**
14801 * ADDSS
14802 */
14803#ifdef IEM_WITHOUT_ASSEMBLY
14804IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14805{
14806 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14807 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14808 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14809 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14810}
14811#endif
14812
14813
14814/**
14815 * ADDPD
14816 */
14817#ifdef IEM_WITHOUT_ASSEMBLY
14818static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14819{
14820 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14821 return fMxcsr;
14822
14823 RTFLOAT64U r64Src1, r64Src2;
14824 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14825 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14826 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14827 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14828 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14829}
14830
14831
14832IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14833{
14834 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14835 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14836}
14837#endif
14838
14839
14840/**
14841 * ADDSD
14842 */
14843#ifdef IEM_WITHOUT_ASSEMBLY
14844IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14845{
14846 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14847 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14848}
14849#endif
14850
14851
14852/**
14853 * MULPS
14854 */
14855#ifdef IEM_WITHOUT_ASSEMBLY
14856static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14857{
14858 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14859 return fMxcsr;
14860
14861 RTFLOAT32U r32Src1, r32Src2;
14862 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14863 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14864 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14865 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14866 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14867}
14868
14869
14870IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14871{
14872 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14873 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14874 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14875 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14876}
14877#endif
14878
14879
14880/**
14881 * MULSS
14882 */
14883#ifdef IEM_WITHOUT_ASSEMBLY
14884IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14885{
14886 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14887 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14888 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14889 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14890}
14891#endif
14892
14893
14894/**
14895 * MULPD
14896 */
14897#ifdef IEM_WITHOUT_ASSEMBLY
14898static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14899{
14900 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14901 return fMxcsr;
14902
14903 RTFLOAT64U r64Src1, r64Src2;
14904 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14905 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14906 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14907 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14908 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14909}
14910
14911
14912IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14913{
14914 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14915 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14916}
14917#endif
14918
14919
14920/**
14921 * MULSD
14922 */
14923#ifdef IEM_WITHOUT_ASSEMBLY
14924IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
14925{
14926 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
14927 pResult->uResult.ar64[1] = puSrc1->ar64[1];
14928}
14929#endif
14930
14931
14932/**
14933 * SUBPS
14934 */
14935#ifdef IEM_WITHOUT_ASSEMBLY
14936static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
14937{
14938 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
14939 return fMxcsr;
14940
14941 RTFLOAT32U r32Src1, r32Src2;
14942 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
14943 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
14944 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14945 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
14946 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
14947}
14948
14949
14950IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14951{
14952 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
14953 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
14954 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
14955 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
14956}
14957#endif
14958
14959
14960/**
14961 * SUBSS
14962 */
14963#ifdef IEM_WITHOUT_ASSEMBLY
14964IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
14965{
14966 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
14967 pResult->uResult.ar32[1] = puSrc1->ar32[1];
14968 pResult->uResult.ar32[2] = puSrc1->ar32[2];
14969 pResult->uResult.ar32[3] = puSrc1->ar32[3];
14970}
14971#endif
14972
14973
14974/**
14975 * SUBPD
14976 */
14977#ifdef IEM_WITHOUT_ASSEMBLY
14978static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
14979{
14980 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
14981 return fMxcsr;
14982
14983 RTFLOAT64U r64Src1, r64Src2;
14984 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
14985 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
14986 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
14987 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
14988 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
14989}
14990
14991
14992IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
14993{
14994 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
14995 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
14996}
14997#endif
14998
14999
15000/**
15001 * SUBSD
15002 */
15003#ifdef IEM_WITHOUT_ASSEMBLY
15004IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15005{
15006 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15007 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15008}
15009#endif
15010
15011
15012/**
15013 * MINPS
15014 */
15015#ifdef IEM_WITHOUT_ASSEMBLY
15016static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15017{
15018 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15019 {
15020 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15021 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15022 return fMxcsr | X86_MXCSR_IE;
15023 }
15024
15025 RTFLOAT32U r32Src1, r32Src2;
15026 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15027 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15028 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15029 {
15030 *pr32Res = r32Src2;
15031 return fMxcsr;
15032 }
15033
15034 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15035 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15036 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15037 fLe
15038 ? iemFpSoftF32FromIprt(&r32Src1)
15039 : iemFpSoftF32FromIprt(&r32Src2),
15040 pr32Res, fMxcsr);
15041}
15042
15043
15044IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15045{
15046 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15047 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15048 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15049 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15050}
15051#endif
15052
15053
15054/**
15055 * MINSS
15056 */
15057#ifdef IEM_WITHOUT_ASSEMBLY
15058IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15059{
15060 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15061 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15062 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15063 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15064}
15065#endif
15066
15067
15068/**
15069 * MINPD
15070 */
15071#ifdef IEM_WITHOUT_ASSEMBLY
15072static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15073{
15074 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15075 {
15076 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15077 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15078 return fMxcsr | X86_MXCSR_IE;
15079 }
15080
15081 RTFLOAT64U r64Src1, r64Src2;
15082 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15083 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15084 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15085 {
15086 *pr64Res = r64Src2;
15087 return fMxcsr;
15088 }
15089
15090 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15091 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15092 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15093 fLe
15094 ? iemFpSoftF64FromIprt(&r64Src1)
15095 : iemFpSoftF64FromIprt(&r64Src2),
15096 pr64Res, fMxcsr);
15097}
15098
15099
15100IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15101{
15102 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15103 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15104}
15105#endif
15106
15107
15108/**
15109 * MINSD
15110 */
15111#ifdef IEM_WITHOUT_ASSEMBLY
15112IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15113{
15114 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15115 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15116}
15117#endif
15118
15119
15120/**
15121 * DIVPS
15122 */
15123#ifdef IEM_WITHOUT_ASSEMBLY
15124static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15125{
15126 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15127 return fMxcsr;
15128
15129 RTFLOAT32U r32Src1, r32Src2;
15130 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15131 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15132 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15133 {
15134 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15135 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15136 {
15137 *pr32Res = g_ar32QNaN[1];
15138 return fMxcsr | X86_MXCSR_IE;
15139 }
15140 else if (RTFLOAT32U_IS_INF(&r32Src1))
15141 {
15142 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15143 return fMxcsr;
15144 }
15145 else
15146 {
15147 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15148 return fMxcsr | X86_MXCSR_ZE;
15149 }
15150 }
15151
15152 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15153 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15154 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15155}
15156
15157
15158IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15159{
15160 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15161 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15162 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15163 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15164}
15165#endif
15166
15167
15168/**
15169 * DIVSS
15170 */
15171#ifdef IEM_WITHOUT_ASSEMBLY
15172IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15173{
15174 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15175 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15176 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15177 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15178}
15179#endif
15180
15181
15182/**
15183 * DIVPD
15184 */
15185#ifdef IEM_WITHOUT_ASSEMBLY
15186static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15187{
15188 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15189 return fMxcsr;
15190
15191 RTFLOAT64U r64Src1, r64Src2;
15192 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15193 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15194 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15195 {
15196 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15197 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15198 {
15199 *pr64Res = g_ar64QNaN[1];
15200 return fMxcsr | X86_MXCSR_IE;
15201 }
15202 else if (RTFLOAT64U_IS_INF(&r64Src1))
15203 {
15204 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15205 return fMxcsr;
15206 }
15207 else
15208 {
15209 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15210 return fMxcsr | X86_MXCSR_ZE;
15211 }
15212 }
15213
15214 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15215 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15216 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15217}
15218
15219
15220IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15221{
15222 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15223 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15224}
15225#endif
15226
15227
15228/**
15229 * DIVSD
15230 */
15231#ifdef IEM_WITHOUT_ASSEMBLY
15232IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15233{
15234 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15235 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15236}
15237#endif
15238
15239
15240/**
15241 * MAXPS
15242 */
15243#ifdef IEM_WITHOUT_ASSEMBLY
15244static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15245{
15246 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15247 {
15248 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15249 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15250 return fMxcsr | X86_MXCSR_IE;
15251 }
15252
15253 RTFLOAT32U r32Src1, r32Src2;
15254 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15255 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15256 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15257 {
15258 *pr32Res = r32Src2;
15259 return fMxcsr;
15260 }
15261
15262 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15263 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15264 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15265 fLe
15266 ? iemFpSoftF32FromIprt(&r32Src2)
15267 : iemFpSoftF32FromIprt(&r32Src1),
15268 pr32Res, fMxcsr);
15269}
15270
15271
15272IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15273{
15274 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15275 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15276 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15277 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15278}
15279#endif
15280
15281
15282/**
15283 * MAXSS
15284 */
15285#ifdef IEM_WITHOUT_ASSEMBLY
15286IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15287{
15288 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15289 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15290 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15291 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15292}
15293#endif
15294
15295
15296/**
15297 * MAXPD
15298 */
15299#ifdef IEM_WITHOUT_ASSEMBLY
15300static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15301{
15302 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15303 {
15304 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15305 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15306 return fMxcsr | X86_MXCSR_IE;
15307 }
15308
15309 RTFLOAT64U r64Src1, r64Src2;
15310 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15311 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15312 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15313 {
15314 *pr64Res = r64Src2;
15315 return fMxcsr;
15316 }
15317
15318 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15319 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15320 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15321 fLe
15322 ? iemFpSoftF64FromIprt(&r64Src2)
15323 : iemFpSoftF64FromIprt(&r64Src1),
15324 pr64Res, fMxcsr);
15325}
15326
15327
15328IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15329{
15330 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15331 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15332}
15333#endif
15334
15335
15336/**
15337 * MAXSD
15338 */
15339#ifdef IEM_WITHOUT_ASSEMBLY
15340IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15341{
15342 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15343 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15344}
15345#endif
15346
15347
15348/**
15349 * CVTSS2SD
15350 */
15351#ifdef IEM_WITHOUT_ASSEMBLY
15352static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15353{
15354 RTFLOAT32U r32Src1;
15355 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15356
15357 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15358 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15359 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15360}
15361
15362
15363IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15364{
15365 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15366 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15367}
15368#endif
15369
15370
15371/**
15372 * CVTSD2SS
15373 */
15374#ifdef IEM_WITHOUT_ASSEMBLY
15375static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15376{
15377 RTFLOAT64U r64Src1;
15378 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15379
15380 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15381 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15382 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15383}
15384
15385
15386IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15387{
15388 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15389 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15390 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15391 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15392}
15393#endif
15394
15395
15396/**
15397 * HADDPS
15398 */
15399#ifdef IEM_WITHOUT_ASSEMBLY
15400IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15401{
15402 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15403 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15404 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15405 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15406}
15407#endif
15408
15409
15410/**
15411 * HADDPD
15412 */
15413#ifdef IEM_WITHOUT_ASSEMBLY
15414IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15415{
15416 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15417 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15418}
15419#endif
15420
15421
15422/**
15423 * HSUBPS
15424 */
15425#ifdef IEM_WITHOUT_ASSEMBLY
15426IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15427{
15428 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15429 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15430 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15431 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15432}
15433#endif
15434
15435
15436/**
15437 * HSUBPD
15438 */
15439#ifdef IEM_WITHOUT_ASSEMBLY
15440IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15441{
15442 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15443 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15444}
15445#endif
15446
15447
15448/**
15449 * SQRTPS
15450 */
15451#ifdef IEM_WITHOUT_ASSEMBLY
15452static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15453{
15454 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15455 return fMxcsr;
15456
15457 RTFLOAT32U r32Src;
15458 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15459 if (RTFLOAT32U_IS_ZERO(&r32Src))
15460 {
15461 *pr32Res = r32Src;
15462 return fMxcsr;
15463 }
15464 else if (r32Src.s.fSign)
15465 {
15466 *pr32Res = g_ar32QNaN[1];
15467 return fMxcsr | X86_MXCSR_IE;
15468 }
15469
15470 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15471 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15472 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15473}
15474
15475
15476IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15477{
15478 RT_NOREF(puSrc1);
15479
15480 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15481 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15482 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15483 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15484}
15485#endif
15486
15487
15488/**
15489 * SQRTSS
15490 */
15491#ifdef IEM_WITHOUT_ASSEMBLY
15492IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15493{
15494 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15495 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15496 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15497 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15498}
15499#endif
15500
15501
15502/**
15503 * SQRTPD
15504 */
15505#ifdef IEM_WITHOUT_ASSEMBLY
15506static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15507{
15508 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15509 return fMxcsr;
15510
15511 RTFLOAT64U r64Src;
15512 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15513 if (RTFLOAT64U_IS_ZERO(&r64Src))
15514 {
15515 *pr64Res = r64Src;
15516 return fMxcsr;
15517 }
15518 else if (r64Src.s.fSign)
15519 {
15520 *pr64Res = g_ar64QNaN[1];
15521 return fMxcsr | X86_MXCSR_IE;
15522 }
15523
15524 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15525 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15526 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15527}
15528
15529
15530IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15531{
15532 RT_NOREF(puSrc1);
15533
15534 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15535 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15536}
15537#endif
15538
15539
15540/**
15541 * SQRTSD
15542 */
15543#ifdef IEM_WITHOUT_ASSEMBLY
15544IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15545{
15546 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15547 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15548}
15549#endif
15550
15551
15552/**
15553 * ADDSUBPS
15554 */
15555#ifdef IEM_WITHOUT_ASSEMBLY
15556IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15557{
15558 RT_NOREF(puSrc1);
15559
15560 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15561 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15562 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15563 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15564}
15565#endif
15566
15567
15568/**
15569 * ADDSUBPD
15570 */
15571#ifdef IEM_WITHOUT_ASSEMBLY
15572IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15573{
15574 RT_NOREF(puSrc1);
15575
15576 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15577 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15578}
15579#endif
15580
15581
15582/**
15583 * CVTPD2PS
15584 */
15585#ifdef IEM_WITHOUT_ASSEMBLY
15586static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15587{
15588 RTFLOAT64U r64Src1;
15589 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15590
15591 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15592 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15593 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15594}
15595
15596
15597IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15598{
15599 RT_NOREF(puSrc1);
15600
15601 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15602 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15603 pResult->uResult.au32[2] = 0;
15604 pResult->uResult.au32[3] = 0;
15605}
15606#endif
15607
15608
15609/**
15610 * CVTPS2PD
15611 */
15612#ifdef IEM_WITHOUT_ASSEMBLY
15613static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15614{
15615 RTFLOAT32U r32Src1;
15616 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15617
15618 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15619 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15620 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15621}
15622
15623
15624IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15625{
15626 RT_NOREF(puSrc1);
15627
15628 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15629 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15630}
15631#endif
15632
15633
15634/**
15635 * CVTDQ2PS
15636 */
15637#ifdef IEM_WITHOUT_ASSEMBLY
15638static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15639{
15640 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15641 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15642 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15643}
15644
15645
15646IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15647{
15648 RT_NOREF(puSrc1);
15649
15650 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15651 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15652 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
15653 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
15654}
15655#endif
15656
15657
15658/**
15659 * CVTPS2DQ
15660 */
15661#ifdef IEM_WITHOUT_ASSEMBLY
15662static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15663{
15664 RTFLOAT32U r32Src;
15665 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15666
15667 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15668 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15669 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15670}
15671
15672
15673IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15674{
15675 RT_NOREF(puSrc1);
15676
15677 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15678 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15679 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15680 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15681}
15682#endif
15683
15684
15685/**
15686 * CVTTPS2DQ
15687 */
15688#ifdef IEM_WITHOUT_ASSEMBLY
15689static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
15690{
15691 RTFLOAT32U r32Src;
15692 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
15693
15694 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15695 SoftState.roundingMode = softfloat_round_minMag;
15696 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
15697 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15698}
15699
15700
15701IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15702{
15703 RT_NOREF(puSrc1);
15704
15705 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15706 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15707 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15708 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15709}
15710#endif
15711
15712
15713/**
15714 * CVTTPD2DQ
15715 */
15716#ifdef IEM_WITHOUT_ASSEMBLY
15717static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15718{
15719 RTFLOAT64U r64Src;
15720 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15721
15722 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15723 SoftState.roundingMode = softfloat_round_minMag;
15724 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15725 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15726}
15727
15728
15729IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15730{
15731 RT_NOREF(puSrc1);
15732
15733 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15734 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15735 pResult->uResult.au64[1] = 0;
15736}
15737#endif
15738
15739
15740/**
15741 * CVTDQ2PD
15742 */
15743#ifdef IEM_WITHOUT_ASSEMBLY
15744static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
15745{
15746 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15747 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
15748 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15749}
15750
15751
15752IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15753{
15754 RT_NOREF(puSrc1);
15755
15756 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
15757 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
15758}
15759#endif
15760
15761
15762/**
15763 * CVTPD2DQ
15764 */
15765#ifdef IEM_WITHOUT_ASSEMBLY
15766static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
15767{
15768 RTFLOAT64U r64Src;
15769 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
15770
15771 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15772 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
15773 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
15774}
15775
15776
15777IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15778{
15779 RT_NOREF(puSrc1);
15780
15781 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15782 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15783 pResult->uResult.au64[1] = 0;
15784}
15785#endif
15786
15787
15788/**
15789 * [V]SHUFPS
15790 */
15791#ifdef IEM_WITHOUT_ASSEMBLY
15792IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15793{
15794 RTUINT128U const uSrc1 = *puDst;
15795 RTUINT128U const uSrc2 = *puSrc;
15796 ASMCompilerBarrier();
15797 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15798 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15799 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15800 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15801}
15802#endif
15803
15804
15805IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15806{
15807 RTUINT128U const uSrc1 = *puSrc1;
15808 RTUINT128U const uSrc2 = *puSrc2;
15809 ASMCompilerBarrier();
15810 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15811 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15812 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15813 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15814}
15815
15816
15817IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15818{
15819 RTUINT256U const uSrc1 = *puSrc1;
15820 RTUINT256U const uSrc2 = *puSrc2;
15821 ASMCompilerBarrier();
15822 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
15823 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
15824 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
15825 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
15826
15827 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
15828 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
15829 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
15830 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
15831}
15832
15833
15834/**
15835 * [V]SHUFPD
15836 */
15837#ifdef IEM_WITHOUT_ASSEMBLY
15838IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15839{
15840 RTUINT128U const uSrc1 = *puDst;
15841 RTUINT128U const uSrc2 = *puSrc;
15842 ASMCompilerBarrier();
15843 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15844 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15845}
15846#endif
15847
15848
15849IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
15850{
15851 RTUINT128U const uSrc1 = *puSrc1;
15852 RTUINT128U const uSrc2 = *puSrc2;
15853 ASMCompilerBarrier();
15854 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15855 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15856}
15857
15858
15859IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
15860{
15861 RTUINT256U const uSrc1 = *puSrc1;
15862 RTUINT256U const uSrc2 = *puSrc2;
15863 ASMCompilerBarrier();
15864 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
15865 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
15866 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
15867 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
15868}
15869
15870
15871/*
15872 * PHMINPOSUW / VPHMINPOSUW
15873 */
15874IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15875{
15876 uint16_t u16Min = puSrc->au16[0];
15877 uint8_t idxMin = 0;
15878
15879 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
15880 if (puSrc->au16[i] < u16Min)
15881 {
15882 u16Min = puSrc->au16[i];
15883 idxMin = i;
15884 }
15885
15886 puDst->au64[0] = 0;
15887 puDst->au64[1] = 0;
15888 puDst->au16[0] = u16Min;
15889 puDst->au16[1] = idxMin;
15890}
15891
15892
15893IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
15894{
15895 iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
15896}
15897
15898
15899/*
15900 * [V]PBLENDVB
15901 */
15902IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15903{
15904 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15905 if (puMask->au8[i] & RT_BIT(7))
15906 puDst->au8[i] = puSrc->au8[i];
15907}
15908
15909
15910IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15911{
15912 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15913 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15914}
15915
15916
15917IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15918{
15919 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
15920 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
15921}
15922
15923
15924/*
15925 * [V]BLENDVPS
15926 */
15927IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15928{
15929 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15930 if (puMask->au32[i] & RT_BIT_32(31))
15931 puDst->au32[i] = puSrc->au32[i];
15932}
15933
15934
15935IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15936{
15937 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15938 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15939}
15940
15941
15942IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15943{
15944 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
15945 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
15946}
15947
15948
15949/*
15950 * [V]BLENDVPD
15951 */
15952IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
15953{
15954 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
15955 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
15956}
15957
15958
15959IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
15960{
15961 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15962 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15963}
15964
15965
15966IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
15967{
15968 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
15969 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
15970}
15971
15972
15973/**
15974 * [V]PALIGNR
15975 */
15976IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
15977{
15978 uint64_t const u64Src1 = *pu64Dst;
15979 ASMCompilerBarrier();
15980
15981 if (bEvil >= 16)
15982 *pu64Dst = 0;
15983 else if (bEvil >= 8)
15984 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
15985 else
15986 {
15987 uint8_t cShift = bEvil * 8;
15988 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
15989 | (u64Src2 >> cShift);
15990 }
15991}
15992
15993
15994IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
15995{
15996 RTUINT128U const uSrc1 = *puDst;
15997 RTUINT128U const uSrc2 = *puSrc;
15998 ASMCompilerBarrier();
15999
16000 puDst->au64[0] = 0;
16001 puDst->au64[1] = 0;
16002 if (bEvil >= 32)
16003 { /* Everything stays 0. */ }
16004 else if (bEvil >= 16)
16005 {
16006 bEvil -= 16;
16007 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16008 puDst->au8[i - bEvil] = uSrc1.au8[i];
16009 }
16010 else
16011 {
16012 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16013 puDst->au8[i] = uSrc2.au8[i + bEvil];
16014 for (uint8_t i = 0; i < bEvil; i++)
16015 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16016 }
16017}
16018
16019
16020IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16021{
16022 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16023 RTUINT128U const uSrc2 = *puSrc2;
16024 ASMCompilerBarrier();
16025
16026 puDst->au64[0] = 0;
16027 puDst->au64[1] = 0;
16028 if (bEvil >= 32)
16029 { /* Everything stays 0. */ }
16030 else if (bEvil >= 16)
16031 {
16032 bEvil -= 16;
16033 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16034 puDst->au8[i - bEvil] = uSrc1.au8[i];
16035 }
16036 else
16037 {
16038 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16039 puDst->au8[i] = uSrc2.au8[i + bEvil];
16040 for (uint8_t i = 0; i < bEvil; i++)
16041 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16042 }
16043}
16044
16045
16046IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16047{
16048 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16049 RTUINT256U const uSrc2 = *puSrc2;
16050 ASMCompilerBarrier();
16051
16052 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16053 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16054}
16055
16056
16057/**
16058 * [V]PBLENDW
16059 */
16060IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16061{
16062 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16063 if (bEvil & RT_BIT(i))
16064 puDst->au16[i] = puSrc->au16[i];
16065}
16066
16067
16068IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16069{
16070 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16071 if (bEvil & RT_BIT(i))
16072 puDst->au16[i] = puSrc2->au16[i];
16073 else
16074 puDst->au16[i] = puSrc1->au16[i];
16075}
16076
16077
16078IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16079{
16080 for (uint8_t i = 0; i < 8; i++)
16081 if (bEvil & RT_BIT(i))
16082 {
16083 puDst->au16[ i] = puSrc2->au16[ i];
16084 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16085 }
16086 else
16087 {
16088 puDst->au16[ i] = puSrc1->au16[ i];
16089 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16090 }
16091}
16092
16093
16094/**
16095 * [V]BLENDPS
16096 */
16097IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16098{
16099 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16100 if (bEvil & RT_BIT(i))
16101 puDst->au32[i] = puSrc->au32[i];
16102}
16103
16104
16105IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16106{
16107 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16108 if (bEvil & RT_BIT(i))
16109 puDst->au32[i] = puSrc2->au32[i];
16110 else
16111 puDst->au32[i] = puSrc1->au32[i];
16112}
16113
16114
16115IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16116{
16117 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16118 if (bEvil & RT_BIT(i))
16119 puDst->au32[i] = puSrc2->au32[i];
16120 else
16121 puDst->au32[i] = puSrc1->au32[i];
16122}
16123
16124
16125/**
16126 * [V]BLENDPD
16127 */
16128IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16129{
16130 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16131 if (bEvil & RT_BIT(i))
16132 puDst->au64[i] = puSrc->au64[i];
16133}
16134
16135
16136IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16137{
16138 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16139 if (bEvil & RT_BIT(i))
16140 puDst->au64[i] = puSrc2->au64[i];
16141 else
16142 puDst->au64[i] = puSrc1->au64[i];
16143}
16144
16145
16146IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16147{
16148 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16149 if (bEvil & RT_BIT(i))
16150 puDst->au64[i] = puSrc2->au64[i];
16151 else
16152 puDst->au64[i] = puSrc1->au64[i];
16153}
16154
16155
16156/**
16157 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16158 */
16159
16160static uint8_t iemAImpl_aes_sbox[] = {
16161 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
16162 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
16163 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
16164 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
16165 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
16166 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
16167 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
16168 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
16169 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
16170 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
16171 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
16172 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
16173 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
16174 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
16175 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
16176 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
16177};
16178
16179/* The InvS-Box lookup table. */
16180static uint8_t iemAImpl_aes_inv_sbox[] = {
16181 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
16182 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
16183 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
16184 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
16185 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
16186 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
16187 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
16188 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
16189 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
16190 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
16191 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
16192 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
16193 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
16194 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
16195 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
16196 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
16197};
16198
16199/* The ShiftRows lookup table. */
16200static uint8_t iemAImpl_aes_shift_rows_tbl[] = {
16201 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
16202};
16203
16204/* The InvShiftRows lookup table. */
16205static uint8_t iemAImpl_aes_inv_shift_rows_tbl[] = {
16206 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3
16207};
16208
16209static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16210{
16211 RTUINT128U uVal;
16212 int i;
16213
16214 for (i = 0; i < 16; ++i)
16215 uVal.au8[i] = abSubst[puSrc->au8[i]];
16216
16217 return uVal;
16218}
16219
16220static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
16221{
16222 return (u << 1) ^ (((u >> 7) & 1) * 27);
16223}
16224
16225static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16226{
16227 RTUINT128U uVal;
16228 int i;
16229 uint8_t tmp;
16230
16231 for (i = 0; i < 16; i += 4) {
16232 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16233 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16234 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16235 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16236 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16237 }
16238
16239 return uVal;
16240}
16241
16242static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16243{
16244 RTUINT128U uVal;
16245 int i;
16246
16247 for (i = 0; i < 16; ++i)
16248 uVal.au8[i] = puSrc->au8[abShift[i]];
16249
16250 return uVal;
16251}
16252
16253static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16254{
16255 uint8_t val;
16256
16257 val = ((b >> 0) & 1) * a;
16258 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16259 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16260 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16261 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16262
16263 return val;
16264}
16265
16266static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16267{
16268 RTUINT128U uVal;
16269 int i;
16270
16271 for (i = 0; i < 16; i += 4) {
16272 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16273 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16274 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16275 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16276 }
16277
16278 return uVal;
16279}
16280
16281static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16282{
16283 RTUINT32U uTmp;
16284
16285 uTmp.au32[0] = w;
16286 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16287 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16288 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16289 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16290
16291 return uTmp.au32[0];
16292}
16293
/**
 * Rotates the 32-bit word @a w right by 8 bits, i.e. the lowest byte moves to
 * the top and the other three bytes move down one position.
 *
 * @returns The rotated word.
 * @param   w   The input word.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uLowByteOnTop = w << 24;
    uint32_t const uUpperBytes   = w >> 8;
    return uUpperBytes | uLowByteOnTop;
}
16298
16299/**
16300 * [V]AESKEYGENASSIST
16301 */
16302IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16303{
16304 RTUINT128U uTmp;
16305 uint32_t uRCon = bImm; /* Round constant. */
16306
16307 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16308 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16309 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16310 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16311
16312 *puDst = uTmp;
16313}
16314
16315
16316/**
16317 * [V]AESIMC
16318 */
16319IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16320{
16321 *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
16322}
16323
16324
16325/**
16326 * [V]AESENC
16327 */
16328IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16329{
16330 RTUINT128U uTmp;
16331
16332 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16333 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16334 uTmp = iemAImpl_aes_mix_col(&uTmp);
16335 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16336 uTmp.au64[1] ^= puSrc->au64[1];
16337
16338 *puDst = uTmp;
16339}
16340
16341
16342/**
16343 * [V]AESENCLAST
16344 */
16345IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16346{
16347 RTUINT128U uTmp;
16348
16349 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16350 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16351 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16352 uTmp.au64[1] ^= puSrc->au64[1];
16353
16354 *puDst = uTmp;
16355}
16356
16357
16358/**
16359 * [V]AESDEC
16360 */
16361IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16362{
16363 RTUINT128U uTmp;
16364
16365 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16366 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16367 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16368 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16369 uTmp.au64[1] ^= puSrc->au64[1];
16370
16371 *puDst = uTmp;
16372}
16373
16374
16375/**
16376 * [V]AESDECLAST
16377 */
16378IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16379{
16380 RTUINT128U uTmp;
16381
16382 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16383 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16384 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16385 uTmp.au64[1] ^= puSrc->au64[1];
16386
16387 *puDst = uTmp;
16388}
16389
16390
16391/**
16392 * [V]PCMPISTRI
16393 */
16394IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRISRC pSrc, uint8_t bEvil))
16395{
16396 RT_NOREF(pu32Ecx, pEFlags, pSrc, bEvil);
16397 AssertReleaseFailed();
16398}
16399
16400
16401/*
16402 * [V]PCLMULQDQ
16403 */
16404IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16405{
16406 iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
16407}
16408
16409
16410IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16411{
16412 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
16413 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
16414
16415 puDst->au64[0] = 0;
16416 puDst->au64[1] = 0;
16417
16418 /*
16419 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
16420 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
16421 * and squeeze out some optimizations.
16422 */
16423 if (uSrc1 & 0x1)
16424 puDst->au64[0] = uSrc2;
16425
16426 uSrc1 >>= 1;
16427
16428 uint8_t iDigit = 1;
16429 while (uSrc1)
16430 {
16431 if (uSrc1 & 0x1)
16432 {
16433 puDst->au64[0] ^= (uSrc2 << iDigit);
16434 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
16435 }
16436
16437 uSrc1 >>= 1;
16438 iDigit++;
16439 }
16440}
16441
16442
16443/**
16444 * [V]PINSRW
16445 */
16446#ifdef IEM_WITHOUT_ASSEMBLY
16447IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
16448{
16449 uint8_t cShift = (bEvil & 0x3) * 16;
16450 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
16451}
16452
16453
16454IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
16455{
16456 puDst->au16[bEvil & 0x7] = u16Src;
16457}
16458#endif
16459
16460
16461IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
16462{
16463 *puDst = *puSrc;
16464 puDst->au16[bEvil & 0x7] = u16Src;
16465}
16466
16467
16468/**
16469 * [V]PEXTRW
16470 */
16471#ifdef IEM_WITHOUT_ASSEMBLY
16472IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
16473{
16474 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
16475}
16476
16477
16478IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16479{
16480 *pu16Dst = puSrc->au16[bEvil & 0x7];
16481}
16482
16483#endif
16484
16485IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
16486{
16487 *pu16Dst = puSrc->au16[bEvil & 0x7];
16488}
16489
16490
16491/**
16492 * [V]MOVMSKPS
16493 */
16494#ifdef IEM_WITHOUT_ASSEMBLY
16495IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16496{
16497 *pu8Dst = puSrc->au32[0] >> 31;
16498 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16499 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16500 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16501}
16502
16503#endif
16504
16505IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16506{
16507 *pu8Dst = puSrc->au32[0] >> 31;
16508 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16509 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16510 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16511}
16512
16513
16514IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16515{
16516 *pu8Dst = puSrc->au32[0] >> 31;
16517 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
16518 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
16519 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
16520 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
16521 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
16522 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
16523 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
16524}
16525
16526
16527/**
16528 * [V]MOVMSKPD
16529 */
16530#ifdef IEM_WITHOUT_ASSEMBLY
16531IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16532{
16533 *pu8Dst = puSrc->au64[0] >> 63;
16534 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16535}
16536
16537#endif
16538
16539IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
16540{
16541 *pu8Dst = puSrc->au64[0] >> 63;
16542 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16543}
16544
16545
16546IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
16547{
16548 *pu8Dst = puSrc->au64[0] >> 63;
16549 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
16550 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
16551 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
16552}
16553
16554
16555/**
16556 * CVTTSD2SI
16557 */
16558#ifdef IEM_WITHOUT_ASSEMBLY
16559IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16560{
16561 RTFLOAT64U r64Src;
16562
16563 r64Src.u = *pu64Src;
16564 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16565
16566 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16567 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16568 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16569}
16570
16571
16572IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16573{
16574 RTFLOAT64U r64Src;
16575
16576 r64Src.u = *pu64Src;
16577 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16578
16579 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16580 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
16581 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16582}
16583#endif
16584
16585
16586/**
16587 * CVTSD2SI
16588 */
16589#ifdef IEM_WITHOUT_ASSEMBLY
16590IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
16591{
16592 RTFLOAT64U r64Src;
16593
16594 r64Src.u = *pu64Src;
16595 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16596
16597 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16598 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16599 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16600}
16601
16602
16603IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
16604{
16605 RTFLOAT64U r64Src;
16606
16607 r64Src.u = *pu64Src;
16608 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
16609
16610 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16611 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16612 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16613}
16614#endif
16615
16616
16617/**
16618 * CVTTSS2SI
16619 */
16620#ifdef IEM_WITHOUT_ASSEMBLY
16621IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16622{
16623 RTFLOAT32U r32Src;
16624
16625 r32Src.u = *pu32Src;
16626 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16627
16628 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16629 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16630 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16631}
16632
16633
16634IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16635{
16636 RTFLOAT32U r32Src;
16637
16638 r32Src.u = *pu32Src;
16639 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16640
16641 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16642 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16643 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16644}
16645#endif
16646
16647
16648/**
16649 * CVTSS2SI
16650 */
16651#ifdef IEM_WITHOUT_ASSEMBLY
16652IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
16653{
16654 RTFLOAT32U r32Src;
16655
16656 r32Src.u = *pu32Src;
16657 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16658
16659 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16660 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16661 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16662}
16663
16664
16665IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
16666{
16667 RTFLOAT32U r32Src;
16668
16669 r32Src.u = *pu32Src;
16670 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
16671
16672 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16673 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16674 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16675}
16676#endif
16677
16678
16679/**
16680 * CVTSI2SD
16681 */
16682#ifdef IEM_WITHOUT_ASSEMBLY
16683IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
16684{
16685 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16686 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
16687 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16688}
16689
16690
16691IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
16692{
16693 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16694 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
16695 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
16696}
16697#endif
16698
16699
16700/**
16701 * CVTSI2SS
16702 */
16703#ifdef IEM_WITHOUT_ASSEMBLY
16704IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
16705{
16706 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16707 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
16708 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16709}
16710
16711
16712IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
16713{
16714 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
16715 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
16716 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
16717}
16718#endif
16719
16720
16721/**
16722 * [V]UCOMISS
16723 */
16724#ifdef IEM_WITHOUT_ASSEMBLY
16725IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16726{
16727 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16728
16729 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
16730 {
16731 *pfMxcsr |= X86_MXCSR_IE;
16732 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16733 }
16734 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16735 {
16736 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16737 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16738 }
16739 else
16740 {
16741 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16742
16743 RTFLOAT32U r32Src1, r32Src2;
16744 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16745 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16746
16747 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16748 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16749 if (f32_eq(f32Src1, f32Src2, &SoftState))
16750 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16751 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16752 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16753 /* else: GREATER_THAN 000 */
16754
16755 *pfMxcsr |= fDe;
16756 }
16757
16758 *pfEFlags = fEFlagsNew;
16759}
16760#endif
16761
/**
 * VUCOMISS fallback; forwards all arguments unchanged to iemAImpl_ucomiss_u128.
 *
 * @param   pfMxcsr     The MXCSR value, in/out (passed through).
 * @param   pfEFlags    The EFLAGS value, in/out (passed through).
 * @param   puSrc1      First source operand (passed through).
 * @param   puSrc2      Second source operand (passed through).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
16766
16767
16768/**
16769 * [V]UCOMISD
16770 */
16771#ifdef IEM_WITHOUT_ASSEMBLY
16772IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16773{
16774 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16775
16776 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
16777 {
16778 *pfMxcsr |= X86_MXCSR_IE;
16779 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16780 }
16781 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16782 {
16783 /* ucomiss doesn't raise \#IE for quiet NaNs. */
16784 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16785 }
16786 else
16787 {
16788 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16789
16790 RTFLOAT64U r64Src1, r64Src2;
16791 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16792 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16793
16794 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16795 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16796 if (f64_eq(f64Src1, f64Src2, &SoftState))
16797 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16798 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16799 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16800 /* else: GREATER_THAN 000 */
16801
16802 *pfMxcsr |= fDe;
16803 }
16804
16805 *pfEFlags = fEFlagsNew;
16806}
16807#endif
16808
/**
 * VUCOMISD fallback; forwards all arguments unchanged to iemAImpl_ucomisd_u128.
 *
 * @param   pfMxcsr     The MXCSR value, in/out (passed through).
 * @param   pfEFlags    The EFLAGS value, in/out (passed through).
 * @param   puSrc1      First source operand (passed through).
 * @param   puSrc2      Second source operand (passed through).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
16813
16814
16815/**
16816 * [V]COMISS
16817 */
16818#ifdef IEM_WITHOUT_ASSEMBLY
16819IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16820{
16821 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16822
16823 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
16824 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
16825 {
16826 *pfMxcsr |= X86_MXCSR_IE;
16827 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16828 }
16829 else
16830 {
16831 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16832
16833 RTFLOAT32U r32Src1, r32Src2;
16834 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
16835 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
16836
16837 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16838 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16839 if (f32_eq(f32Src1, f32Src2, &SoftState))
16840 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16841 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16842 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16843 /* else: GREATER_THAN 000 */
16844
16845 *pfMxcsr |= fDe;
16846 }
16847
16848 *pfEFlags = fEFlagsNew;
16849}
16850#endif
16851
16852
/**
 * VCOMISS fallback; forwards all arguments unchanged to iemAImpl_comiss_u128.
 *
 * @param   pfMxcsr     The MXCSR value, in/out (passed through).
 * @param   pfEFlags    The EFLAGS value, in/out (passed through).
 * @param   puSrc1      First source operand (passed through).
 * @param   puSrc2      Second source operand (passed through).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
16857
16858
16859/**
16860 * [V]COMISD
16861 */
16862#ifdef IEM_WITHOUT_ASSEMBLY
16863IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16864{
16865 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
16866
16867 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
16868 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
16869 {
16870 *pfMxcsr |= X86_MXCSR_IE;
16871 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
16872 }
16873 else
16874 {
16875 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16876
16877 RTFLOAT64U r64Src1, r64Src2;
16878 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
16879 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
16880
16881 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
16882 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
16883 if (f64_eq(f64Src1, f64Src2, &SoftState))
16884 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
16885 else if (f64_lt(f64Src1, f64Src2, &SoftState))
16886 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
16887 /* else: GREATER_THAN 000 */
16888
16889 *pfMxcsr |= fDe;
16890 }
16891
16892 *pfEFlags = fEFlagsNew;
16893}
16894#endif
16895
/**
 * VCOMISD fallback; forwards all arguments unchanged to iemAImpl_comisd_u128.
 *
 * @param   pfMxcsr     The MXCSR value, in/out (passed through).
 * @param   pfEFlags    The EFLAGS value, in/out (passed through).
 * @param   puSrc1      First source operand (passed through).
 * @param   puSrc2      Second source operand (passed through).
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
{
    iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
}
16900
16901
16902/**
16903 * CMPPS / CMPPD / CMPSS / CMPSD
16904 */
16905#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * A compare truth table entry.
 *
 * One entry describes a single compare predicate: whether a quiet NaN operand
 * raises the invalid-operation flag, and the boolean outcome for each of the
 * four possible relations between the operands A and B.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Flag whether the \#IA is signalled when one of the source operands is a QNaN. */
    bool fSignalsOnQNan;
    /** The boolean result when the input operands are unordered. */
    bool fUnordered;
    /** The boolean result when A = B. */
    bool fEqual;
    /** The boolean result when A < B. */
    bool fLowerThan;
    /** The boolean result when A > B. */
    bool fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const truth table entry. */
typedef const CMPTRUTHTBLENTRY *PCCMPTRUTHTBLENTRY;
16924
16925
/**
 * The compare truth table (indexed by immediate).
 *
 * Only the eight predicates 00H thru 07H are present; the compare workers
 * assert that the index is within the table and the callers mask the immediate
 * with 0x7 before the lookup.
 */
static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
{
    /*                    fSignalsOnQNan  fUnordered  fEqual  fLowerThan  fGreaterThan */
    /* 00H (EQ_OQ)   */ { false, false, true, false, false },
    /* 01H (LT_OS)   */ { true, false, false, true, false },
    /* 02H (LE_OS)   */ { true, false, true, true, false },
    /* 03H (UNORD_Q) */ { false, true, false, false, false },
    /* 04H (NEQ_UQ)  */ { false, true, false, true, true },
    /* 05H (NLT_US)  */ { true, true, true, false, true },
    /* 06H (NLE_US)  */ { true, true, false, false, true },
    /* 07H (ORD_Q)   */ { false, false, true, true, true },
    /** @todo AVX variants. */
};
16940
16941
16942static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
16943{
16944 bool fRes;
16945 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16946
16947 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
16948 {
16949 *pfMxcsr |= X86_MXCSR_IE;
16950 fRes = g_aCmpTbl[bEvil].fUnordered;
16951 }
16952 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
16953 {
16954 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16955 *pfMxcsr |= X86_MXCSR_IE;
16956 fRes = g_aCmpTbl[bEvil].fUnordered;
16957 }
16958 else
16959 {
16960 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
16961
16962 RTFLOAT32U r32Src1, r32Src2;
16963 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
16964 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
16965
16966 *pfMxcsr |= fDe;
16967 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
16968 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
16969 if (f32_eq(f32Src1, f32Src2, &SoftState))
16970 fRes = g_aCmpTbl[bEvil].fEqual;
16971 else if (f32_lt(f32Src1, f32Src2, &SoftState))
16972 fRes = g_aCmpTbl[bEvil].fLowerThan;
16973 else
16974 fRes = g_aCmpTbl[bEvil].fGreaterThan;
16975 }
16976
16977 return fRes;
16978}
16979
16980
16981static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
16982{
16983 bool fRes;
16984 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
16985
16986 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
16987 {
16988 *pfMxcsr |= X86_MXCSR_IE;
16989 fRes = g_aCmpTbl[bEvil].fUnordered;
16990 }
16991 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
16992 {
16993 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
16994 *pfMxcsr |= X86_MXCSR_IE;
16995 fRes = g_aCmpTbl[bEvil].fUnordered;
16996 }
16997 else
16998 {
16999 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17000
17001 RTFLOAT64U r64Src1, r64Src2;
17002 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1);
17003 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17004
17005 *pfMxcsr |= fDe;
17006 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17007 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17008 if (f64_eq(f64Src1, f64Src2, &SoftState))
17009 fRes = g_aCmpTbl[bEvil].fEqual;
17010 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17011 fRes = g_aCmpTbl[bEvil].fLowerThan;
17012 else
17013 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17014 }
17015
17016 return fRes;
17017}
17018
17019
17020IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17021{
17022 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17023 {
17024 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17025 puDst->au32[i] = UINT32_MAX;
17026 else
17027 puDst->au32[i] = 0;
17028 }
17029}
17030
17031
17032IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17033{
17034 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17035 {
17036 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17037 puDst->au64[i] = UINT64_MAX;
17038 else
17039 puDst->au64[i] = 0;
17040 }
17041}
17042
17043
17044IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17045{
17046 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17047 puDst->au32[0] = UINT32_MAX;
17048 else
17049 puDst->au32[0] = 0;
17050
17051 puDst->au32[1] = pSrc->uSrc1.au32[1];
17052 puDst->au64[1] = pSrc->uSrc1.au64[1];
17053}
17054
17055
17056IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17057{
17058 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17059 puDst->au64[0] = UINT64_MAX;
17060 else
17061 puDst->au64[0] = 0;
17062
17063 puDst->au64[1] = pSrc->uSrc1.au64[1];
17064}
17065#endif
17066
17067
17068/**
17069 * CVTPD2PI
17070 */
17071#ifdef IEM_WITHOUT_ASSEMBLY
17072static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17073{
17074 RTFLOAT64U r64Src;
17075 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17076
17077 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17078 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17079 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17080}
17081
17082
17083IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17084{
17085 RTUINT64U u64Res;
17086 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17087 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17088
17089 *pu64Dst = u64Res.u;
17090 *pfMxcsr = fMxcsrOut;
17091}
17092#endif
17093
17094
17095/**
17096 * CVTTPD2PI
17097 */
17098#ifdef IEM_WITHOUT_ASSEMBLY
17099static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17100{
17101 RTFLOAT64U r64Src;
17102 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17103
17104 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17105 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17106 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17107}
17108
17109
17110IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17111{
17112 RTUINT64U u64Res;
17113 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17114 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17115
17116 *pu64Dst = u64Res.u;
17117 *pfMxcsr = fMxcsrOut;
17118}
17119#endif
17120
17121
17122/**
17123 * CVTPI2PS
17124 */
17125#ifdef IEM_WITHOUT_ASSEMBLY
17126static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17127{
17128 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17129 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17130 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17131}
17132
17133
17134IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17135{
17136 RTUINT64U uSrc = { u64Src };
17137 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17138 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17139 *pfMxcsr = fMxcsrOut;
17140}
17141#endif
17142
17143
17144/**
17145 * CVTPI2PD
17146 */
17147#ifdef IEM_WITHOUT_ASSEMBLY
17148static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17149{
17150 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17151 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17152 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17153}
17154
17155
17156IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17157{
17158 RTUINT64U uSrc = { u64Src };
17159 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17160 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17161 *pfMxcsr = fMxcsrOut;
17162}
17163#endif
17164
17165
17166/**
17167 * CVTPS2PI
17168 */
17169#ifdef IEM_WITHOUT_ASSEMBLY
17170static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17171{
17172 RTFLOAT32U r32Src;
17173 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17174
17175 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17176 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17177 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17178}
17179
17180
17181IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17182{
17183 RTUINT64U uDst;
17184 RTUINT64U uSrc = { u64Src };
17185 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17186 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17187 *pu64Dst = uDst.u;
17188 *pfMxcsr = fMxcsrOut;
17189}
17190#endif
17191
17192
17193/**
17194 * CVTTPS2PI
17195 */
17196#ifdef IEM_WITHOUT_ASSEMBLY
17197static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17198{
17199 RTFLOAT32U r32Src;
17200 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17201
17202 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17203 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17204 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17205}
17206
17207
17208IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17209{
17210 RTUINT64U uDst;
17211 RTUINT64U uSrc = { u64Src };
17212 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17213 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17214 *pu64Dst = uDst.u;
17215 *pfMxcsr = fMxcsrOut;
17216}
17217#endif
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette