VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImplC.cpp@ 100599

最後變更 在這個檔案從100599是 100599,由 vboxsync 提交於 20 月 前

VMM/IEM: Implement vpaddusb/vpaddusw instruction emulations, bugref:9898

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 695.7 KB
 
1/* $Id: IEMAllAImplC.cpp 100599 2023-07-17 11:51:24Z vboxsync $ */
2/** @file
3 * IEM - Instruction Implementation in Assembly, portable C variant.
4 */
5
6/*
7 * Copyright (C) 2011-2023 Oracle and/or its affiliates.
8 *
9 * This file is part of VirtualBox base platform packages, as
10 * available from https://www.virtualbox.org.
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation, in version 3 of the
15 * License.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, see <https://www.gnu.org/licenses>.
24 *
25 * SPDX-License-Identifier: GPL-3.0-only
26 */
27
28
29/*********************************************************************************************************************************
30* Header Files *
31*********************************************************************************************************************************/
32#include "IEMInternal.h"
33#include <VBox/vmm/vmcc.h>
34#include <iprt/errcore.h>
35#include <iprt/x86.h>
36#include <iprt/uint128.h>
37#include <iprt/uint256.h>
38#include <iprt/crc.h>
39
40RT_C_DECLS_BEGIN
41#include <softfloat.h>
42RT_C_DECLS_END
43
44
45/*********************************************************************************************************************************
46* Defined Constants And Macros *
47*********************************************************************************************************************************/
/** @def IEM_WITHOUT_ASSEMBLY
 * Enables all the code in this file.
 *
 * Unless the build already defined it, it is switched on for ARM32, ARM64 and
 * doxygen runs.  IEM_WITH_ASSEMBLY trumps it (for tstIEMAImplAsm purposes) and
 * removes the define again.
 */
#if    !defined(IEM_WITHOUT_ASSEMBLY) \
    && (defined(RT_ARCH_ARM32) || defined(RT_ARCH_ARM64) || defined(DOXYGEN_RUNNING))
# define IEM_WITHOUT_ASSEMBLY
#endif
#if defined(IEM_WITH_ASSEMBLY)
# undef IEM_WITHOUT_ASSEMBLY
#endif
60
/**
 * Derives the sign flag (SF) from a result and its bit width.
 *
 * SF duplicates the most significant bit of the result, so the result is
 * shifted right until that bit sits at the SF position and every other bit is
 * masked away.
 *
 * @returns X86_EFL_SF or 0.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
    ( X86_EFL_SF & (uint32_t)((a_uResult) >> ((a_cBitsWidth) - (X86_EFL_SF_BIT + 1))) )
73
/**
 * Derives the zero flag (ZF) from a result.
 *
 * ZF is set exactly when the result is zero.
 *
 * @returns X86_EFL_ZF or 0.
 * @param   a_uResult   Unsigned result value.
 */
#define X86_EFL_CALC_ZF(a_uResult) \
    ( (uint32_t)!(a_uResult) << X86_EFL_ZF_BIT )
84
/**
 * Extracts the OF flag from an OF calculation result.
 *
 * The input carries the overflow indication in its most significant bit; each
 * variant moves that bit to the OF position.  These are typically used by
 * concatenating the name with a bit count.  The 8-bit variant has to shift
 * left because its top bit sits below the OF bit, the wider ones shift right.
 */
#define X86_EFL_GET_OF_8(a_uValue)  ( X86_EFL_OF & ((uint32_t)(a_uValue) << (X86_EFL_OF_BIT + 1 - 8)) )
#define X86_EFL_GET_OF_16(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (16 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_32(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (32 - (X86_EFL_OF_BIT + 1))) )
#define X86_EFL_GET_OF_64(a_uValue) ( X86_EFL_OF & (uint32_t)((a_uValue) >> (64 - (X86_EFL_OF_BIT + 1))) )
95
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after arithmetic op.
 *
 * This is a statement macro: it yields no value, the recomputed flags are
 * stored back through @a a_pfEFlags with all non-status bits left as they
 * were.  Most arguments are expanded several times, so pass plain variables.
 *
 * Overflow during ADDition happens when both inputs have the same sign bit
 * value and the result has a different sign bit value.
 *
 * Since subtraction can be rewritten as addition: 2 - 1 == 2 + -1, it follows
 * that for SUBtraction the sign bit value must differ between the two inputs
 * and the result's sign bit must differ from the first input.  Callers express
 * that by passing @a a_uSrcOf with the sign bit flipped.
 * Note! Must xor with sign bit to convert, not do (0 - a_uSrc).
 *
 * See also: http://teaching.idallen.com/dat2343/10f/notes/040_overflow.txt
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF and OF calc).
 * @param   a_uSrc          The source value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).  It is
 *                          token-pasted into helper and type names, so it must
 *                          be given as a literal.
 * @param   a_CfExpr        Bool expression for the carry flag (CF).
 * @param   a_uSrcOf        The a_uSrc value to use for overflow calculation.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(a_pfEFlags, a_uResult, a_uDst, a_uSrc, a_cBitsWidth, a_CfExpr, a_uSrcOf) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |=   ((a_CfExpr) << X86_EFL_CF_BIT) \
                   | g_afParity[(a_uResult) & 0xff] \
                   | (((uint32_t)(a_uResult) ^ (uint32_t)(a_uSrc) ^ (uint32_t)(a_uDst)) & X86_EFL_AF) \
                   | X86_EFL_CALC_ZF(a_uResult) \
                   | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                   | X86_EFL_GET_OF_ ## a_cBitsWidth(  (  ((uint ## a_cBitsWidth ## _t)~((a_uDst) ^ (a_uSrcOf))) \
                                                        & RT_BIT_64(a_cBitsWidth - 1)) \
                                                     & ((a_uResult) ^ (a_uDst)) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
132
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) after a logical op.
 *
 * CF and OF are defined to be 0 by logical operations.  AF on the other hand
 * is undefined.  We do not set AF, as that seems to make the most sense (which
 * probably makes it the most wrong in real life).
 *
 * This is a statement macro: it yields no value, the recomputed flags are
 * stored back through @a a_pfEFlags.  All status bits are cleared first, then
 * PF, ZF and SF are derived from the result and @a a_fExtra is OR'ed in.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_fExtra        Additional bits to set.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(a_pfEFlags, a_uResult, a_cBitsWidth, a_fExtra) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        fEflTmp &= ~X86_EFL_STATUS_BITS; \
        fEflTmp |=   g_afParity[(a_uResult) & 0xff] \
                   | X86_EFL_CALC_ZF(a_uResult) \
                   | X86_EFL_CALC_SF(a_uResult, a_cBitsWidth) \
                   | (a_fExtra); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
156
157
158/*********************************************************************************************************************************
159* Global Variables *
160*********************************************************************************************************************************/
161/**
162 * Parity calculation table.
163 *
164 * This is also used by iemAllAImpl.asm.
165 *
166 * The generator code:
167 * @code
168 * #include <stdio.h>
169 *
170 * int main()
171 * {
172 * unsigned b;
173 * for (b = 0; b < 256; b++)
174 * {
175 * int cOnes = ( b & 1)
176 * + ((b >> 1) & 1)
177 * + ((b >> 2) & 1)
178 * + ((b >> 3) & 1)
179 * + ((b >> 4) & 1)
180 * + ((b >> 5) & 1)
181 * + ((b >> 6) & 1)
182 * + ((b >> 7) & 1);
183 * printf(" /" "* %#04x = %u%u%u%u%u%u%u%ub *" "/ %s,\n",
184 * b,
185 * (b >> 7) & 1,
186 * (b >> 6) & 1,
187 * (b >> 5) & 1,
188 * (b >> 4) & 1,
189 * (b >> 3) & 1,
190 * (b >> 2) & 1,
191 * (b >> 1) & 1,
192 * b & 1,
193 * cOnes & 1 ? "0" : "X86_EFL_PF");
194 * }
195 * return 0;
196 * }
197 * @endcode
198 */
199uint8_t const g_afParity[256] =
200{
201 /* 0000 = 00000000b */ X86_EFL_PF,
202 /* 0x01 = 00000001b */ 0,
203 /* 0x02 = 00000010b */ 0,
204 /* 0x03 = 00000011b */ X86_EFL_PF,
205 /* 0x04 = 00000100b */ 0,
206 /* 0x05 = 00000101b */ X86_EFL_PF,
207 /* 0x06 = 00000110b */ X86_EFL_PF,
208 /* 0x07 = 00000111b */ 0,
209 /* 0x08 = 00001000b */ 0,
210 /* 0x09 = 00001001b */ X86_EFL_PF,
211 /* 0x0a = 00001010b */ X86_EFL_PF,
212 /* 0x0b = 00001011b */ 0,
213 /* 0x0c = 00001100b */ X86_EFL_PF,
214 /* 0x0d = 00001101b */ 0,
215 /* 0x0e = 00001110b */ 0,
216 /* 0x0f = 00001111b */ X86_EFL_PF,
217 /* 0x10 = 00010000b */ 0,
218 /* 0x11 = 00010001b */ X86_EFL_PF,
219 /* 0x12 = 00010010b */ X86_EFL_PF,
220 /* 0x13 = 00010011b */ 0,
221 /* 0x14 = 00010100b */ X86_EFL_PF,
222 /* 0x15 = 00010101b */ 0,
223 /* 0x16 = 00010110b */ 0,
224 /* 0x17 = 00010111b */ X86_EFL_PF,
225 /* 0x18 = 00011000b */ X86_EFL_PF,
226 /* 0x19 = 00011001b */ 0,
227 /* 0x1a = 00011010b */ 0,
228 /* 0x1b = 00011011b */ X86_EFL_PF,
229 /* 0x1c = 00011100b */ 0,
230 /* 0x1d = 00011101b */ X86_EFL_PF,
231 /* 0x1e = 00011110b */ X86_EFL_PF,
232 /* 0x1f = 00011111b */ 0,
233 /* 0x20 = 00100000b */ 0,
234 /* 0x21 = 00100001b */ X86_EFL_PF,
235 /* 0x22 = 00100010b */ X86_EFL_PF,
236 /* 0x23 = 00100011b */ 0,
237 /* 0x24 = 00100100b */ X86_EFL_PF,
238 /* 0x25 = 00100101b */ 0,
239 /* 0x26 = 00100110b */ 0,
240 /* 0x27 = 00100111b */ X86_EFL_PF,
241 /* 0x28 = 00101000b */ X86_EFL_PF,
242 /* 0x29 = 00101001b */ 0,
243 /* 0x2a = 00101010b */ 0,
244 /* 0x2b = 00101011b */ X86_EFL_PF,
245 /* 0x2c = 00101100b */ 0,
246 /* 0x2d = 00101101b */ X86_EFL_PF,
247 /* 0x2e = 00101110b */ X86_EFL_PF,
248 /* 0x2f = 00101111b */ 0,
249 /* 0x30 = 00110000b */ X86_EFL_PF,
250 /* 0x31 = 00110001b */ 0,
251 /* 0x32 = 00110010b */ 0,
252 /* 0x33 = 00110011b */ X86_EFL_PF,
253 /* 0x34 = 00110100b */ 0,
254 /* 0x35 = 00110101b */ X86_EFL_PF,
255 /* 0x36 = 00110110b */ X86_EFL_PF,
256 /* 0x37 = 00110111b */ 0,
257 /* 0x38 = 00111000b */ 0,
258 /* 0x39 = 00111001b */ X86_EFL_PF,
259 /* 0x3a = 00111010b */ X86_EFL_PF,
260 /* 0x3b = 00111011b */ 0,
261 /* 0x3c = 00111100b */ X86_EFL_PF,
262 /* 0x3d = 00111101b */ 0,
263 /* 0x3e = 00111110b */ 0,
264 /* 0x3f = 00111111b */ X86_EFL_PF,
265 /* 0x40 = 01000000b */ 0,
266 /* 0x41 = 01000001b */ X86_EFL_PF,
267 /* 0x42 = 01000010b */ X86_EFL_PF,
268 /* 0x43 = 01000011b */ 0,
269 /* 0x44 = 01000100b */ X86_EFL_PF,
270 /* 0x45 = 01000101b */ 0,
271 /* 0x46 = 01000110b */ 0,
272 /* 0x47 = 01000111b */ X86_EFL_PF,
273 /* 0x48 = 01001000b */ X86_EFL_PF,
274 /* 0x49 = 01001001b */ 0,
275 /* 0x4a = 01001010b */ 0,
276 /* 0x4b = 01001011b */ X86_EFL_PF,
277 /* 0x4c = 01001100b */ 0,
278 /* 0x4d = 01001101b */ X86_EFL_PF,
279 /* 0x4e = 01001110b */ X86_EFL_PF,
280 /* 0x4f = 01001111b */ 0,
281 /* 0x50 = 01010000b */ X86_EFL_PF,
282 /* 0x51 = 01010001b */ 0,
283 /* 0x52 = 01010010b */ 0,
284 /* 0x53 = 01010011b */ X86_EFL_PF,
285 /* 0x54 = 01010100b */ 0,
286 /* 0x55 = 01010101b */ X86_EFL_PF,
287 /* 0x56 = 01010110b */ X86_EFL_PF,
288 /* 0x57 = 01010111b */ 0,
289 /* 0x58 = 01011000b */ 0,
290 /* 0x59 = 01011001b */ X86_EFL_PF,
291 /* 0x5a = 01011010b */ X86_EFL_PF,
292 /* 0x5b = 01011011b */ 0,
293 /* 0x5c = 01011100b */ X86_EFL_PF,
294 /* 0x5d = 01011101b */ 0,
295 /* 0x5e = 01011110b */ 0,
296 /* 0x5f = 01011111b */ X86_EFL_PF,
297 /* 0x60 = 01100000b */ X86_EFL_PF,
298 /* 0x61 = 01100001b */ 0,
299 /* 0x62 = 01100010b */ 0,
300 /* 0x63 = 01100011b */ X86_EFL_PF,
301 /* 0x64 = 01100100b */ 0,
302 /* 0x65 = 01100101b */ X86_EFL_PF,
303 /* 0x66 = 01100110b */ X86_EFL_PF,
304 /* 0x67 = 01100111b */ 0,
305 /* 0x68 = 01101000b */ 0,
306 /* 0x69 = 01101001b */ X86_EFL_PF,
307 /* 0x6a = 01101010b */ X86_EFL_PF,
308 /* 0x6b = 01101011b */ 0,
309 /* 0x6c = 01101100b */ X86_EFL_PF,
310 /* 0x6d = 01101101b */ 0,
311 /* 0x6e = 01101110b */ 0,
312 /* 0x6f = 01101111b */ X86_EFL_PF,
313 /* 0x70 = 01110000b */ 0,
314 /* 0x71 = 01110001b */ X86_EFL_PF,
315 /* 0x72 = 01110010b */ X86_EFL_PF,
316 /* 0x73 = 01110011b */ 0,
317 /* 0x74 = 01110100b */ X86_EFL_PF,
318 /* 0x75 = 01110101b */ 0,
319 /* 0x76 = 01110110b */ 0,
320 /* 0x77 = 01110111b */ X86_EFL_PF,
321 /* 0x78 = 01111000b */ X86_EFL_PF,
322 /* 0x79 = 01111001b */ 0,
323 /* 0x7a = 01111010b */ 0,
324 /* 0x7b = 01111011b */ X86_EFL_PF,
325 /* 0x7c = 01111100b */ 0,
326 /* 0x7d = 01111101b */ X86_EFL_PF,
327 /* 0x7e = 01111110b */ X86_EFL_PF,
328 /* 0x7f = 01111111b */ 0,
329 /* 0x80 = 10000000b */ 0,
330 /* 0x81 = 10000001b */ X86_EFL_PF,
331 /* 0x82 = 10000010b */ X86_EFL_PF,
332 /* 0x83 = 10000011b */ 0,
333 /* 0x84 = 10000100b */ X86_EFL_PF,
334 /* 0x85 = 10000101b */ 0,
335 /* 0x86 = 10000110b */ 0,
336 /* 0x87 = 10000111b */ X86_EFL_PF,
337 /* 0x88 = 10001000b */ X86_EFL_PF,
338 /* 0x89 = 10001001b */ 0,
339 /* 0x8a = 10001010b */ 0,
340 /* 0x8b = 10001011b */ X86_EFL_PF,
341 /* 0x8c = 10001100b */ 0,
342 /* 0x8d = 10001101b */ X86_EFL_PF,
343 /* 0x8e = 10001110b */ X86_EFL_PF,
344 /* 0x8f = 10001111b */ 0,
345 /* 0x90 = 10010000b */ X86_EFL_PF,
346 /* 0x91 = 10010001b */ 0,
347 /* 0x92 = 10010010b */ 0,
348 /* 0x93 = 10010011b */ X86_EFL_PF,
349 /* 0x94 = 10010100b */ 0,
350 /* 0x95 = 10010101b */ X86_EFL_PF,
351 /* 0x96 = 10010110b */ X86_EFL_PF,
352 /* 0x97 = 10010111b */ 0,
353 /* 0x98 = 10011000b */ 0,
354 /* 0x99 = 10011001b */ X86_EFL_PF,
355 /* 0x9a = 10011010b */ X86_EFL_PF,
356 /* 0x9b = 10011011b */ 0,
357 /* 0x9c = 10011100b */ X86_EFL_PF,
358 /* 0x9d = 10011101b */ 0,
359 /* 0x9e = 10011110b */ 0,
360 /* 0x9f = 10011111b */ X86_EFL_PF,
361 /* 0xa0 = 10100000b */ X86_EFL_PF,
362 /* 0xa1 = 10100001b */ 0,
363 /* 0xa2 = 10100010b */ 0,
364 /* 0xa3 = 10100011b */ X86_EFL_PF,
365 /* 0xa4 = 10100100b */ 0,
366 /* 0xa5 = 10100101b */ X86_EFL_PF,
367 /* 0xa6 = 10100110b */ X86_EFL_PF,
368 /* 0xa7 = 10100111b */ 0,
369 /* 0xa8 = 10101000b */ 0,
370 /* 0xa9 = 10101001b */ X86_EFL_PF,
371 /* 0xaa = 10101010b */ X86_EFL_PF,
372 /* 0xab = 10101011b */ 0,
373 /* 0xac = 10101100b */ X86_EFL_PF,
374 /* 0xad = 10101101b */ 0,
375 /* 0xae = 10101110b */ 0,
376 /* 0xaf = 10101111b */ X86_EFL_PF,
377 /* 0xb0 = 10110000b */ 0,
378 /* 0xb1 = 10110001b */ X86_EFL_PF,
379 /* 0xb2 = 10110010b */ X86_EFL_PF,
380 /* 0xb3 = 10110011b */ 0,
381 /* 0xb4 = 10110100b */ X86_EFL_PF,
382 /* 0xb5 = 10110101b */ 0,
383 /* 0xb6 = 10110110b */ 0,
384 /* 0xb7 = 10110111b */ X86_EFL_PF,
385 /* 0xb8 = 10111000b */ X86_EFL_PF,
386 /* 0xb9 = 10111001b */ 0,
387 /* 0xba = 10111010b */ 0,
388 /* 0xbb = 10111011b */ X86_EFL_PF,
389 /* 0xbc = 10111100b */ 0,
390 /* 0xbd = 10111101b */ X86_EFL_PF,
391 /* 0xbe = 10111110b */ X86_EFL_PF,
392 /* 0xbf = 10111111b */ 0,
393 /* 0xc0 = 11000000b */ X86_EFL_PF,
394 /* 0xc1 = 11000001b */ 0,
395 /* 0xc2 = 11000010b */ 0,
396 /* 0xc3 = 11000011b */ X86_EFL_PF,
397 /* 0xc4 = 11000100b */ 0,
398 /* 0xc5 = 11000101b */ X86_EFL_PF,
399 /* 0xc6 = 11000110b */ X86_EFL_PF,
400 /* 0xc7 = 11000111b */ 0,
401 /* 0xc8 = 11001000b */ 0,
402 /* 0xc9 = 11001001b */ X86_EFL_PF,
403 /* 0xca = 11001010b */ X86_EFL_PF,
404 /* 0xcb = 11001011b */ 0,
405 /* 0xcc = 11001100b */ X86_EFL_PF,
406 /* 0xcd = 11001101b */ 0,
407 /* 0xce = 11001110b */ 0,
408 /* 0xcf = 11001111b */ X86_EFL_PF,
409 /* 0xd0 = 11010000b */ 0,
410 /* 0xd1 = 11010001b */ X86_EFL_PF,
411 /* 0xd2 = 11010010b */ X86_EFL_PF,
412 /* 0xd3 = 11010011b */ 0,
413 /* 0xd4 = 11010100b */ X86_EFL_PF,
414 /* 0xd5 = 11010101b */ 0,
415 /* 0xd6 = 11010110b */ 0,
416 /* 0xd7 = 11010111b */ X86_EFL_PF,
417 /* 0xd8 = 11011000b */ X86_EFL_PF,
418 /* 0xd9 = 11011001b */ 0,
419 /* 0xda = 11011010b */ 0,
420 /* 0xdb = 11011011b */ X86_EFL_PF,
421 /* 0xdc = 11011100b */ 0,
422 /* 0xdd = 11011101b */ X86_EFL_PF,
423 /* 0xde = 11011110b */ X86_EFL_PF,
424 /* 0xdf = 11011111b */ 0,
425 /* 0xe0 = 11100000b */ 0,
426 /* 0xe1 = 11100001b */ X86_EFL_PF,
427 /* 0xe2 = 11100010b */ X86_EFL_PF,
428 /* 0xe3 = 11100011b */ 0,
429 /* 0xe4 = 11100100b */ X86_EFL_PF,
430 /* 0xe5 = 11100101b */ 0,
431 /* 0xe6 = 11100110b */ 0,
432 /* 0xe7 = 11100111b */ X86_EFL_PF,
433 /* 0xe8 = 11101000b */ X86_EFL_PF,
434 /* 0xe9 = 11101001b */ 0,
435 /* 0xea = 11101010b */ 0,
436 /* 0xeb = 11101011b */ X86_EFL_PF,
437 /* 0xec = 11101100b */ 0,
438 /* 0xed = 11101101b */ X86_EFL_PF,
439 /* 0xee = 11101110b */ X86_EFL_PF,
440 /* 0xef = 11101111b */ 0,
441 /* 0xf0 = 11110000b */ X86_EFL_PF,
442 /* 0xf1 = 11110001b */ 0,
443 /* 0xf2 = 11110010b */ 0,
444 /* 0xf3 = 11110011b */ X86_EFL_PF,
445 /* 0xf4 = 11110100b */ 0,
446 /* 0xf5 = 11110101b */ X86_EFL_PF,
447 /* 0xf6 = 11110110b */ X86_EFL_PF,
448 /* 0xf7 = 11110111b */ 0,
449 /* 0xf8 = 11111000b */ 0,
450 /* 0xf9 = 11111001b */ X86_EFL_PF,
451 /* 0xfa = 11111010b */ X86_EFL_PF,
452 /* 0xfb = 11111011b */ 0,
453 /* 0xfc = 11111100b */ X86_EFL_PF,
454 /* 0xfd = 11111101b */ 0,
455 /* 0xfe = 11111110b */ 0,
456 /* 0xff = 11111111b */ X86_EFL_PF,
457};
458
459/* for clang: */
460extern const RTFLOAT32U g_ar32Zero[];
461extern const RTFLOAT64U g_ar64Zero[];
462extern const RTFLOAT80U g_ar80Zero[];
463extern const RTFLOAT80U g_ar80One[];
464extern const RTFLOAT80U g_r80Indefinite;
465extern const RTFLOAT32U g_ar32Infinity[];
466extern const RTFLOAT64U g_ar64Infinity[];
467extern const RTFLOAT80U g_ar80Infinity[];
468extern const RTFLOAT128U g_r128Ln2;
469extern const RTUINT128U g_u128Ln2Mantissa;
470extern const RTUINT128U g_u128Ln2MantissaIntel;
471extern const RTFLOAT128U g_ar128F2xm1HornerConsts[];
472extern const RTFLOAT32U g_ar32QNaN[];
473extern const RTFLOAT64U g_ar64QNaN[];
474
475/** Zero values (indexed by fSign). */
476RTFLOAT32U const g_ar32Zero[] = { RTFLOAT32U_INIT_ZERO(0), RTFLOAT32U_INIT_ZERO(1) };
477RTFLOAT64U const g_ar64Zero[] = { RTFLOAT64U_INIT_ZERO(0), RTFLOAT64U_INIT_ZERO(1) };
478RTFLOAT80U const g_ar80Zero[] = { RTFLOAT80U_INIT_ZERO(0), RTFLOAT80U_INIT_ZERO(1) };
479
480/** One values (indexed by fSign). */
481RTFLOAT80U const g_ar80One[] =
482{ RTFLOAT80U_INIT(0, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS), RTFLOAT80U_INIT(1, RT_BIT_64(63), RTFLOAT80U_EXP_BIAS) };
483
484/** Indefinite (negative). */
485RTFLOAT80U const g_r80Indefinite = RTFLOAT80U_INIT_INDEFINITE(1);
486
487/** Infinities (indexed by fSign). */
488RTFLOAT32U const g_ar32Infinity[] = { RTFLOAT32U_INIT_INF(0), RTFLOAT32U_INIT_INF(1) };
489RTFLOAT64U const g_ar64Infinity[] = { RTFLOAT64U_INIT_INF(0), RTFLOAT64U_INIT_INF(1) };
490RTFLOAT80U const g_ar80Infinity[] = { RTFLOAT80U_INIT_INF(0), RTFLOAT80U_INIT_INF(1) };
491
492/** Default QNaNs (indexed by fSign). */
493RTFLOAT32U const g_ar32QNaN[] = { RTFLOAT32U_INIT_QNAN(0), RTFLOAT32U_INIT_QNAN(1) };
494RTFLOAT64U const g_ar64QNaN[] = { RTFLOAT64U_INIT_QNAN(0), RTFLOAT64U_INIT_QNAN(1) };
495
496
#if 0 /* disabled */
/** 128-bit floating point constant: 2.0 (zero fraction, exponent one above the bias). */
RTFLOAT128U const g_r128Two = RTFLOAT128U_INIT_C(0, 0, 0, RTFLOAT128U_EXP_BIAS + 1);
#endif
501
502
503/* The next section is generated by tools/IEMGenFpuConstants: */
504
505/** The ln2 constant as 128-bit floating point value.
506 * base-10: 6.93147180559945309417232121458176575e-1
507 * base-16: b.17217f7d1cf79abc9e3b39803f30@-1
508 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100110e-1 */
509//const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf35793c7673007e6, 0x3ffe);
510const RTFLOAT128U g_r128Ln2 = RTFLOAT128U_INIT_C(0, 0x62e42fefa39e, 0xf357900000000000, 0x3ffe);
511/** High precision ln2 value.
512 * base-10: 6.931471805599453094172321214581765680747e-1
513 * base-16: b.17217f7d1cf79abc9e3b39803f2f6af0@-1
514 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100100111100011101100111001100000000011111100101111011010101111e-1 */
515const RTUINT128U g_u128Ln2Mantissa = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc9e3b39803f2f6af);
516/** High precision ln2 value, compatible with f2xm1 results on intel 10980XE.
517 * base-10: 6.931471805599453094151379470289064954613e-1
518 * base-16: b.17217f7d1cf79abc0000000000000000@-1
519 * base-2 : 1.0110001011100100001011111110111110100011100111101111001101010111100000000000000000000000000000000000000000000000000000000000000e-1 */
520const RTUINT128U g_u128Ln2MantissaIntel = RTUINT128_INIT_C(0xb17217f7d1cf79ab, 0xc000000000000000);
521
522/** Horner constants for f2xm1 */
523const RTFLOAT128U g_ar128F2xm1HornerConsts[] =
524{
525 /* a0
526 * base-10: 1.00000000000000000000000000000000000e0
527 * base-16: 1.0000000000000000000000000000@0
528 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e0 */
529 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3fff),
530 /* a1
531 * base-10: 5.00000000000000000000000000000000000e-1
532 * base-16: 8.0000000000000000000000000000@-1
533 * base-2 : 1.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e-1 */
534 RTFLOAT128U_INIT_C(0, 0x000000000000, 0x0000000000000000, 0x3ffe),
535 /* a2
536 * base-10: 1.66666666666666666666666666666666658e-1
537 * base-16: 2.aaaaaaaaaaaaaaaaaaaaaaaaaaaa@-1
538 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-3 */
539 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffc),
540 /* a3
541 * base-10: 4.16666666666666666666666666666666646e-2
542 * base-16: a.aaaaaaaaaaaaaaaaaaaaaaaaaaa8@-2
543 * base-2 : 1.0101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101010101e-5 */
544 RTFLOAT128U_INIT_C(0, 0x555555555555, 0x5555555555555555, 0x3ffa),
545 /* a4
546 * base-10: 8.33333333333333333333333333333333323e-3
547 * base-16: 2.2222222222222222222222222222@-2
548 * base-2 : 1.0001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001000100010001e-7 */
549 RTFLOAT128U_INIT_C(0, 0x111111111111, 0x1111111111111111, 0x3ff8),
550 /* a5
551 * base-10: 1.38888888888888888888888888888888874e-3
552 * base-16: 5.b05b05b05b05b05b05b05b05b058@-3
553 * base-2 : 1.0110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110110000010110e-10 */
554 RTFLOAT128U_INIT_C(0, 0x6c16c16c16c1, 0x6c16c16c16c16c16, 0x3ff5),
555 /* a6
556 * base-10: 1.98412698412698412698412698412698412e-4
557 * base-16: d.00d00d00d00d00d00d00d00d00d0@-4
558 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-13 */
559 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3ff2),
560 /* a7
561 * base-10: 2.48015873015873015873015873015873015e-5
562 * base-16: 1.a01a01a01a01a01a01a01a01a01a@-4
563 * base-2 : 1.1010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010000000011010e-16 */
564 RTFLOAT128U_INIT_C(0, 0xa01a01a01a01, 0xa01a01a01a01a01a, 0x3fef),
565 /* a8
566 * base-10: 2.75573192239858906525573192239858902e-6
567 * base-16: 2.e3bc74aad8e671f5583911ca002e@-5
568 * base-2 : 1.0111000111011110001110100101010101101100011100110011100011111010101011000001110010001000111001010000000000010111e-19 */
569 RTFLOAT128U_INIT_C(0, 0x71de3a556c73, 0x38faac1c88e50017, 0x3fec),
570 /* a9
571 * base-10: 2.75573192239858906525573192239858865e-7
572 * base-16: 4.9f93edde27d71cbbc05b4fa999e0@-6
573 * base-2 : 1.0010011111100100111110110111011110001001111101011100011100101110111100000001011011010011111010100110011001111000e-22 */
574 RTFLOAT128U_INIT_C(0, 0x27e4fb7789f5, 0xc72ef016d3ea6678, 0x3fe9),
575 /* a10
576 * base-10: 2.50521083854417187750521083854417184e-8
577 * base-16: 6.b99159fd5138e3f9d1f92e0df71c@-7
578 * base-2 : 1.1010111001100100010101100111111101010100010011100011100011111110011101000111111001001011100000110111110111000111e-26 */
579 RTFLOAT128U_INIT_C(0, 0xae64567f544e, 0x38fe747e4b837dc7, 0x3fe5),
580 /* a11
581 * base-10: 2.08767569878680989792100903212014296e-9
582 * base-16: 8.f76c77fc6c4bdaa26d4c3d67f420@-8
583 * base-2 : 1.0001111011101101100011101111111110001101100010010111101101010100010011011010100110000111101011001111111010000100e-29 */
584 RTFLOAT128U_INIT_C(0, 0x1eed8eff8d89, 0x7b544da987acfe84, 0x3fe2),
585 /* a12
586 * base-10: 1.60590438368216145993923771701549472e-10
587 * base-16: b.092309d43684be51c198e91d7b40@-9
588 * base-2 : 1.0110000100100100011000010011101010000110110100001001011111001010001110000011001100011101001000111010111101101000e-33 */
589 RTFLOAT128U_INIT_C(0, 0x6124613a86d0, 0x97ca38331d23af68, 0x3fde),
590 /* a13
591 * base-10: 1.14707455977297247138516979786821043e-11
592 * base-16: c.9cba54603e4e905d6f8a2efd1f20@-10
593 * base-2 : 1.1001001110010111010010101000110000000111110010011101001000001011101011011111000101000101110111111010001111100100e-37 */
594 RTFLOAT128U_INIT_C(0, 0x93974a8c07c9, 0xd20badf145dfa3e4, 0x3fda),
595 /* a14
596 * base-10: 7.64716373181981647590113198578806964e-13
597 * base-16: d.73f9f399dc0f88ec32b587746578@-11
598 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-41 */
599 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd6),
600 /* a15
601 * base-10: 4.77947733238738529743820749111754352e-14
602 * base-16: d.73f9f399dc0f88ec32b587746578@-12
603 * base-2 : 1.1010111001111111001111100111001100111011100000011111000100011101100001100101011010110000111011101000110010101111e-45 */
604 RTFLOAT128U_INIT_C(0, 0xae7f3e733b81, 0xf11d8656b0ee8caf, 0x3fd2),
605 /* a16
606 * base-10: 2.81145725434552076319894558301031970e-15
607 * base-16: c.a963b81856a53593028cbbb8d7f8@-13
608 * base-2 : 1.1001010100101100011101110000001100001010110101001010011010110010011000000101000110010111011101110001101011111111e-49 */
609 RTFLOAT128U_INIT_C(0, 0x952c77030ad4, 0xa6b2605197771aff, 0x3fce),
610 /* a17
611 * base-10: 1.56192069685862264622163643500573321e-16
612 * base-16: b.413c31dcbecbbdd8024435161550@-14
613 * base-2 : 1.0110100000100111100001100011101110010111110110010111011110111011000000000100100010000110101000101100001010101010e-53 */
614 RTFLOAT128U_INIT_C(0, 0x6827863b97d9, 0x77bb004886a2c2aa, 0x3fca),
615 /* a18
616 * base-10: 8.22063524662432971695598123687227980e-18
617 * base-16: 9.7a4da340a0ab92650f61dbdcb3a0@-15
618 * base-2 : 1.0010111101001001101101000110100000010100000101010111001001001100101000011110110000111011011110111001011001110100e-57 */
619 RTFLOAT128U_INIT_C(0, 0x2f49b4681415, 0x724ca1ec3b7b9674, 0x3fc6),
620 /* a19
621 * base-10: 4.11031762331216485847799061843614006e-19
622 * base-16: 7.950ae900808941ea72b4afe3c2e8@-16
623 * base-2 : 1.1110010101000010101110100100000000100000001000100101000001111010100111001010110100101011111110001111000010111010e-62 */
624 RTFLOAT128U_INIT_C(0, 0xe542ba402022, 0x507a9cad2bf8f0ba, 0x3fc1),
625 /* a20
626 * base-10: 1.95729410633912612308475743735054143e-20
627 * base-16: 5.c6e3bdb73d5c62fbc51bf3b9b8fc@-17
628 * base-2 : 1.0111000110111000111011110110110111001111010101110001100010111110111100010100011011111100111011100110111000111111e-66 */
629 RTFLOAT128U_INIT_C(0, 0x71b8ef6dcf57, 0x18bef146fcee6e3f, 0x3fbd),
630 /* a21
631 * base-10: 8.89679139245057328674889744250246106e-22
632 * base-16: 4.338e5b6dfe14a5143242dfcce3a0@-18
633 * base-2 : 1.0000110011100011100101101101101101111111100001010010100101000101000011001001000010110111111100110011100011101000e-70 */
634 RTFLOAT128U_INIT_C(0, 0x0ce396db7f85, 0x29450c90b7f338e8, 0x3fb9),
635};
636
637
638/*
639 * There are a few 64-bit on 32-bit things we'd rather do in C. Actually, doing
640 * it all in C is probably safer atm., optimize what's necessary later, maybe.
641 */
642#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
643
644
645/*********************************************************************************************************************************
646* Binary Operations *
647*********************************************************************************************************************************/
648
649/*
650 * ADD
651 */
652
653IEM_DECL_IMPL_DEF(void, iemAImpl_add_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
654{
655 uint64_t uDst = *puDst;
656 uint64_t uResult = uDst + uSrc;
657 *puDst = uResult;
658 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult < uDst, uSrc);
659}
660
661# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
662
663IEM_DECL_IMPL_DEF(void, iemAImpl_add_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
664{
665 uint32_t uDst = *puDst;
666 uint32_t uResult = uDst + uSrc;
667 *puDst = uResult;
668 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult < uDst, uSrc);
669}
670
671
672IEM_DECL_IMPL_DEF(void, iemAImpl_add_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
673{
674 uint16_t uDst = *puDst;
675 uint16_t uResult = uDst + uSrc;
676 *puDst = uResult;
677 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult < uDst, uSrc);
678}
679
680
681IEM_DECL_IMPL_DEF(void, iemAImpl_add_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
682{
683 uint8_t uDst = *puDst;
684 uint8_t uResult = uDst + uSrc;
685 *puDst = uResult;
686 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult < uDst, uSrc);
687}
688
689# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
690
691/*
692 * ADC
693 */
694
695IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
696{
697 if (!(*pfEFlags & X86_EFL_CF))
698 iemAImpl_add_u64(puDst, uSrc, pfEFlags);
699 else
700 {
701 uint64_t uDst = *puDst;
702 uint64_t uResult = uDst + uSrc + 1;
703 *puDst = uResult;
704 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uResult <= uDst, uSrc);
705 }
706}
707
708# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
709
710IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
711{
712 if (!(*pfEFlags & X86_EFL_CF))
713 iemAImpl_add_u32(puDst, uSrc, pfEFlags);
714 else
715 {
716 uint32_t uDst = *puDst;
717 uint32_t uResult = uDst + uSrc + 1;
718 *puDst = uResult;
719 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uResult <= uDst, uSrc);
720 }
721}
722
723
724IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
725{
726 if (!(*pfEFlags & X86_EFL_CF))
727 iemAImpl_add_u16(puDst, uSrc, pfEFlags);
728 else
729 {
730 uint16_t uDst = *puDst;
731 uint16_t uResult = uDst + uSrc + 1;
732 *puDst = uResult;
733 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uResult <= uDst, uSrc);
734 }
735}
736
737
738IEM_DECL_IMPL_DEF(void, iemAImpl_adc_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
739{
740 if (!(*pfEFlags & X86_EFL_CF))
741 iemAImpl_add_u8(puDst, uSrc, pfEFlags);
742 else
743 {
744 uint8_t uDst = *puDst;
745 uint8_t uResult = uDst + uSrc + 1;
746 *puDst = uResult;
747 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uResult <= uDst, uSrc);
748 }
749}
750
751# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
752
753/*
754 * SUB
755 */
756
757IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
758{
759 uint64_t uDst = *puDst;
760 uint64_t uResult = uDst - uSrc;
761 *puDst = uResult;
762 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst < uSrc, uSrc ^ RT_BIT_64(63));
763}
764
765# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
766
767IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
768{
769 uint32_t uDst = *puDst;
770 uint32_t uResult = uDst - uSrc;
771 *puDst = uResult;
772 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst < uSrc, uSrc ^ RT_BIT_32(31));
773}
774
775
776IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
777{
778 uint16_t uDst = *puDst;
779 uint16_t uResult = uDst - uSrc;
780 *puDst = uResult;
781 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst < uSrc, uSrc ^ (uint16_t)0x8000);
782}
783
784
785IEM_DECL_IMPL_DEF(void, iemAImpl_sub_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
786{
787 uint8_t uDst = *puDst;
788 uint8_t uResult = uDst - uSrc;
789 *puDst = uResult;
790 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst < uSrc, uSrc ^ (uint8_t)0x80);
791}
792
793# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
794
795/*
796 * SBB
797 */
798
799IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
800{
801 if (!(*pfEFlags & X86_EFL_CF))
802 iemAImpl_sub_u64(puDst, uSrc, pfEFlags);
803 else
804 {
805 uint64_t uDst = *puDst;
806 uint64_t uResult = uDst - uSrc - 1;
807 *puDst = uResult;
808 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 64, uDst <= uSrc, uSrc ^ RT_BIT_64(63));
809 }
810}
811
812# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
813
814IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
815{
816 if (!(*pfEFlags & X86_EFL_CF))
817 iemAImpl_sub_u32(puDst, uSrc, pfEFlags);
818 else
819 {
820 uint32_t uDst = *puDst;
821 uint32_t uResult = uDst - uSrc - 1;
822 *puDst = uResult;
823 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 32, uDst <= uSrc, uSrc ^ RT_BIT_32(31));
824 }
825}
826
827
828IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
829{
830 if (!(*pfEFlags & X86_EFL_CF))
831 iemAImpl_sub_u16(puDst, uSrc, pfEFlags);
832 else
833 {
834 uint16_t uDst = *puDst;
835 uint16_t uResult = uDst - uSrc - 1;
836 *puDst = uResult;
837 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 16, uDst <= uSrc, uSrc ^ (uint16_t)0x8000);
838 }
839}
840
841
842IEM_DECL_IMPL_DEF(void, iemAImpl_sbb_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
843{
844 if (!(*pfEFlags & X86_EFL_CF))
845 iemAImpl_sub_u8(puDst, uSrc, pfEFlags);
846 else
847 {
848 uint8_t uDst = *puDst;
849 uint8_t uResult = uDst - uSrc - 1;
850 *puDst = uResult;
851 IEM_EFL_UPDATE_STATUS_BITS_FOR_ARITHMETIC(pfEFlags, uResult, uDst, uSrc, 8, uDst <= uSrc, uSrc ^ (uint8_t)0x80);
852 }
853}
854
855# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
856
857
858/*
859 * OR
860 */
861
862IEM_DECL_IMPL_DEF(void, iemAImpl_or_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
863{
864 uint64_t uResult = *puDst | uSrc;
865 *puDst = uResult;
866 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
867}
868
869# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
870
871IEM_DECL_IMPL_DEF(void, iemAImpl_or_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
872{
873 uint32_t uResult = *puDst | uSrc;
874 *puDst = uResult;
875 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
876}
877
878
879IEM_DECL_IMPL_DEF(void, iemAImpl_or_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
880{
881 uint16_t uResult = *puDst | uSrc;
882 *puDst = uResult;
883 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
884}
885
886
887IEM_DECL_IMPL_DEF(void, iemAImpl_or_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
888{
889 uint8_t uResult = *puDst | uSrc;
890 *puDst = uResult;
891 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
892}
893
894# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
895
896/*
897 * XOR
898 */
899
900IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
901{
902 uint64_t uResult = *puDst ^ uSrc;
903 *puDst = uResult;
904 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
905}
906
907# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
908
909IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
910{
911 uint32_t uResult = *puDst ^ uSrc;
912 *puDst = uResult;
913 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
914}
915
916
917IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
918{
919 uint16_t uResult = *puDst ^ uSrc;
920 *puDst = uResult;
921 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
922}
923
924
925IEM_DECL_IMPL_DEF(void, iemAImpl_xor_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
926{
927 uint8_t uResult = *puDst ^ uSrc;
928 *puDst = uResult;
929 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
930}
931
932# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
933
934/*
935 * AND
936 */
937
938IEM_DECL_IMPL_DEF(void, iemAImpl_and_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
939{
940 uint64_t const uResult = *puDst & uSrc;
941 *puDst = uResult;
942 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
943}
944
945# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
946
947IEM_DECL_IMPL_DEF(void, iemAImpl_and_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
948{
949 uint32_t const uResult = *puDst & uSrc;
950 *puDst = uResult;
951 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
952}
953
954
955IEM_DECL_IMPL_DEF(void, iemAImpl_and_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
956{
957 uint16_t const uResult = *puDst & uSrc;
958 *puDst = uResult;
959 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
960}
961
962
963IEM_DECL_IMPL_DEF(void, iemAImpl_and_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
964{
965 uint8_t const uResult = *puDst & uSrc;
966 *puDst = uResult;
967 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
968}
969
970# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
971#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
972
973/*
974 * ANDN (BMI1 instruction)
975 */
976
977IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64_fallback,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
978{
979 uint64_t const uResult = ~uSrc1 & uSrc2;
980 *puDst = uResult;
981 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
982}
983
984
985IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32_fallback,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
986{
987 uint32_t const uResult = ~uSrc1 & uSrc2;
988 *puDst = uResult;
989 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
990}
991
992
993#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
994IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u64,(uint64_t *puDst, uint64_t uSrc1, uint64_t uSrc2, uint32_t *pfEFlags))
995{
996 iemAImpl_andn_u64_fallback(puDst, uSrc1, uSrc2, pfEFlags);
997}
998#endif
999
1000
1001#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1002IEM_DECL_IMPL_DEF(void, iemAImpl_andn_u32,(uint32_t *puDst, uint32_t uSrc1, uint32_t uSrc2, uint32_t *pfEFlags))
1003{
1004 iemAImpl_andn_u32_fallback(puDst, uSrc1, uSrc2, pfEFlags);
1005}
1006#endif
1007
1008#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1009
1010/*
1011 * CMP
1012 */
1013
1014IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1015{
1016 uint64_t uDstTmp = *puDst;
1017 iemAImpl_sub_u64(&uDstTmp, uSrc, pfEFlags);
1018}
1019
1020# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1021
1022IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1023{
1024 uint32_t uDstTmp = *puDst;
1025 iemAImpl_sub_u32(&uDstTmp, uSrc, pfEFlags);
1026}
1027
1028
1029IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1030{
1031 uint16_t uDstTmp = *puDst;
1032 iemAImpl_sub_u16(&uDstTmp, uSrc, pfEFlags);
1033}
1034
1035
1036IEM_DECL_IMPL_DEF(void, iemAImpl_cmp_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1037{
1038 uint8_t uDstTmp = *puDst;
1039 iemAImpl_sub_u8(&uDstTmp, uSrc, pfEFlags);
1040}
1041
1042# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1043
1044/*
1045 * TEST
1046 */
1047
1048IEM_DECL_IMPL_DEF(void, iemAImpl_test_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1049{
1050 uint64_t uResult = *puDst & uSrc;
1051 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 64, 0);
1052}
1053
1054# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1055
1056IEM_DECL_IMPL_DEF(void, iemAImpl_test_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1057{
1058 uint32_t uResult = *puDst & uSrc;
1059 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 32, 0);
1060}
1061
1062
1063IEM_DECL_IMPL_DEF(void, iemAImpl_test_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1064{
1065 uint16_t uResult = *puDst & uSrc;
1066 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 16, 0);
1067}
1068
1069
1070IEM_DECL_IMPL_DEF(void, iemAImpl_test_u8,(uint8_t *puDst, uint8_t uSrc, uint32_t *pfEFlags))
1071{
1072 uint8_t uResult = *puDst & uSrc;
1073 IEM_EFL_UPDATE_STATUS_BITS_FOR_LOGIC(pfEFlags, uResult, 8, 0);
1074}
1075
1076# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1077
1078
1079/*
1080 * LOCK prefixed variants of the above
1081 */
1082
/**
 * Locked binary operand operation, generic over the operand width.
 *
 * Runs the non-locked worker iemAImpl_<a_Mnemonic>_u<a_cBitsWidth> on private
 * copies of the destination value and EFLAGS inside a compare-exchange loop,
 * retrying until the new value could be swapped into *puDst.  EFLAGS are only
 * written back once the exchange has succeeded.
 *
 * Expects @c puDst, @c uSrc and @c pfEFlags to be in scope where it is expanded.
 */
# define DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
    do { \
        uint ## a_cBitsWidth ## _t uExpected = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
        uint ## a_cBitsWidth ## _t uNewValue; \
        uint32_t                   fEflWork; \
        do \
        { \
            /* Start each attempt from the most recently observed destination value. */ \
            uNewValue = uExpected; \
            fEflWork  = *pfEFlags; \
            iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uNewValue, uSrc, &fEflWork); \
        } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uNewValue, uExpected, &uExpected)); \
        *pfEFlags = fEflWork; \
    } while (0)
1097
1098
1099#define EMIT_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth) \
1100 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
1101 uint ## a_cBitsWidth ## _t uSrc, \
1102 uint32_t *pfEFlags)) \
1103 { \
1104 DO_LOCKED_BIN_OP(a_Mnemonic, a_cBitsWidth); \
1105 }
1106
1107EMIT_LOCKED_BIN_OP(add, 64)
1108EMIT_LOCKED_BIN_OP(adc, 64)
1109EMIT_LOCKED_BIN_OP(sub, 64)
1110EMIT_LOCKED_BIN_OP(sbb, 64)
1111EMIT_LOCKED_BIN_OP(or, 64)
1112EMIT_LOCKED_BIN_OP(xor, 64)
1113EMIT_LOCKED_BIN_OP(and, 64)
1114# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1115EMIT_LOCKED_BIN_OP(add, 32)
1116EMIT_LOCKED_BIN_OP(adc, 32)
1117EMIT_LOCKED_BIN_OP(sub, 32)
1118EMIT_LOCKED_BIN_OP(sbb, 32)
1119EMIT_LOCKED_BIN_OP(or, 32)
1120EMIT_LOCKED_BIN_OP(xor, 32)
1121EMIT_LOCKED_BIN_OP(and, 32)
1122
1123EMIT_LOCKED_BIN_OP(add, 16)
1124EMIT_LOCKED_BIN_OP(adc, 16)
1125EMIT_LOCKED_BIN_OP(sub, 16)
1126EMIT_LOCKED_BIN_OP(sbb, 16)
1127EMIT_LOCKED_BIN_OP(or, 16)
1128EMIT_LOCKED_BIN_OP(xor, 16)
1129EMIT_LOCKED_BIN_OP(and, 16)
1130
1131EMIT_LOCKED_BIN_OP(add, 8)
1132EMIT_LOCKED_BIN_OP(adc, 8)
1133EMIT_LOCKED_BIN_OP(sub, 8)
1134EMIT_LOCKED_BIN_OP(sbb, 8)
1135EMIT_LOCKED_BIN_OP(or, 8)
1136EMIT_LOCKED_BIN_OP(xor, 8)
1137EMIT_LOCKED_BIN_OP(and, 8)
1138# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1139
1140
1141/*
1142 * Bit operations (same signature as above).
1143 */
1144
1145/*
1146 * BT
1147 */
1148
1149IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1150{
1151 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1152 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1153 Assert(uSrc < 64);
1154 uint64_t uDst = *puDst;
1155 if (uDst & RT_BIT_64(uSrc))
1156 *pfEFlags |= X86_EFL_CF;
1157 else
1158 *pfEFlags &= ~X86_EFL_CF;
1159}
1160
1161# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1162
1163IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1164{
1165 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1166 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1167 Assert(uSrc < 32);
1168 uint32_t uDst = *puDst;
1169 if (uDst & RT_BIT_32(uSrc))
1170 *pfEFlags |= X86_EFL_CF;
1171 else
1172 *pfEFlags &= ~X86_EFL_CF;
1173}
1174
1175IEM_DECL_IMPL_DEF(void, iemAImpl_bt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1176{
1177 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1178 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1179 Assert(uSrc < 16);
1180 uint16_t uDst = *puDst;
1181 if (uDst & RT_BIT_32(uSrc))
1182 *pfEFlags |= X86_EFL_CF;
1183 else
1184 *pfEFlags &= ~X86_EFL_CF;
1185}
1186
1187# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1188
1189/*
1190 * BTC
1191 */
1192
1193IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1194{
1195 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1196 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1197 Assert(uSrc < 64);
1198 uint64_t fMask = RT_BIT_64(uSrc);
1199 uint64_t uDst = *puDst;
1200 if (uDst & fMask)
1201 {
1202 uDst &= ~fMask;
1203 *puDst = uDst;
1204 *pfEFlags |= X86_EFL_CF;
1205 }
1206 else
1207 {
1208 uDst |= fMask;
1209 *puDst = uDst;
1210 *pfEFlags &= ~X86_EFL_CF;
1211 }
1212}
1213
1214# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1215
1216IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1217{
1218 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1219 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1220 Assert(uSrc < 32);
1221 uint32_t fMask = RT_BIT_32(uSrc);
1222 uint32_t uDst = *puDst;
1223 if (uDst & fMask)
1224 {
1225 uDst &= ~fMask;
1226 *puDst = uDst;
1227 *pfEFlags |= X86_EFL_CF;
1228 }
1229 else
1230 {
1231 uDst |= fMask;
1232 *puDst = uDst;
1233 *pfEFlags &= ~X86_EFL_CF;
1234 }
1235}
1236
1237
1238IEM_DECL_IMPL_DEF(void, iemAImpl_btc_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1239{
1240 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. However, it seems they're
1241 not modified by either AMD (3990x) or Intel (i9-9980HK). */
1242 Assert(uSrc < 16);
1243 uint16_t fMask = RT_BIT_32(uSrc);
1244 uint16_t uDst = *puDst;
1245 if (uDst & fMask)
1246 {
1247 uDst &= ~fMask;
1248 *puDst = uDst;
1249 *pfEFlags |= X86_EFL_CF;
1250 }
1251 else
1252 {
1253 uDst |= fMask;
1254 *puDst = uDst;
1255 *pfEFlags &= ~X86_EFL_CF;
1256 }
1257}
1258
1259# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1260
1261/*
1262 * BTR
1263 */
1264
1265IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1266{
1267 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1268 logical operation (AND/OR/whatever). */
1269 Assert(uSrc < 64);
1270 uint64_t fMask = RT_BIT_64(uSrc);
1271 uint64_t uDst = *puDst;
1272 if (uDst & fMask)
1273 {
1274 uDst &= ~fMask;
1275 *puDst = uDst;
1276 *pfEFlags |= X86_EFL_CF;
1277 }
1278 else
1279 *pfEFlags &= ~X86_EFL_CF;
1280}
1281
1282# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1283
1284IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1285{
1286 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1287 logical operation (AND/OR/whatever). */
1288 Assert(uSrc < 32);
1289 uint32_t fMask = RT_BIT_32(uSrc);
1290 uint32_t uDst = *puDst;
1291 if (uDst & fMask)
1292 {
1293 uDst &= ~fMask;
1294 *puDst = uDst;
1295 *pfEFlags |= X86_EFL_CF;
1296 }
1297 else
1298 *pfEFlags &= ~X86_EFL_CF;
1299}
1300
1301
1302IEM_DECL_IMPL_DEF(void, iemAImpl_btr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1303{
1304 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1305 logical operation (AND/OR/whatever). */
1306 Assert(uSrc < 16);
1307 uint16_t fMask = RT_BIT_32(uSrc);
1308 uint16_t uDst = *puDst;
1309 if (uDst & fMask)
1310 {
1311 uDst &= ~fMask;
1312 *puDst = uDst;
1313 *pfEFlags |= X86_EFL_CF;
1314 }
1315 else
1316 *pfEFlags &= ~X86_EFL_CF;
1317}
1318
1319# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1320
1321/*
1322 * BTS
1323 */
1324
1325IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1326{
1327 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1328 logical operation (AND/OR/whatever). */
1329 Assert(uSrc < 64);
1330 uint64_t fMask = RT_BIT_64(uSrc);
1331 uint64_t uDst = *puDst;
1332 if (uDst & fMask)
1333 *pfEFlags |= X86_EFL_CF;
1334 else
1335 {
1336 uDst |= fMask;
1337 *puDst = uDst;
1338 *pfEFlags &= ~X86_EFL_CF;
1339 }
1340}
1341
1342# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1343
1344IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1345{
1346 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1347 logical operation (AND/OR/whatever). */
1348 Assert(uSrc < 32);
1349 uint32_t fMask = RT_BIT_32(uSrc);
1350 uint32_t uDst = *puDst;
1351 if (uDst & fMask)
1352 *pfEFlags |= X86_EFL_CF;
1353 else
1354 {
1355 uDst |= fMask;
1356 *puDst = uDst;
1357 *pfEFlags &= ~X86_EFL_CF;
1358 }
1359}
1360
1361
1362IEM_DECL_IMPL_DEF(void, iemAImpl_bts_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1363{
1364 /* Note! "undefined" flags: OF, SF, ZF, AF, PF. We set them as after an
1365 logical operation (AND/OR/whatever). */
1366 Assert(uSrc < 16);
1367 uint16_t fMask = RT_BIT_32(uSrc);
1368 uint32_t uDst = *puDst;
1369 if (uDst & fMask)
1370 *pfEFlags |= X86_EFL_CF;
1371 else
1372 {
1373 uDst |= fMask;
1374 *puDst = uDst;
1375 *pfEFlags &= ~X86_EFL_CF;
1376 }
1377}
1378
1379# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1380
1381
1382EMIT_LOCKED_BIN_OP(btc, 64)
1383EMIT_LOCKED_BIN_OP(btr, 64)
1384EMIT_LOCKED_BIN_OP(bts, 64)
1385# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1386EMIT_LOCKED_BIN_OP(btc, 32)
1387EMIT_LOCKED_BIN_OP(btr, 32)
1388EMIT_LOCKED_BIN_OP(bts, 32)
1389
1390EMIT_LOCKED_BIN_OP(btc, 16)
1391EMIT_LOCKED_BIN_OP(btr, 16)
1392EMIT_LOCKED_BIN_OP(bts, 16)
1393# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1394
1395
/*
 * Helpers for BSR and BSF.
 *
 * Note! "undefined" flags: OF, SF, AF, PF, CF.
 *       Intel behavior modelled on 10980xe, AMD on 3990X.  Other
 *       microarchitectures may produce different results (see
 *       https://www.sandpile.org/x86/flags.htm), but we restrict ourselves
 *       to emulating these recent ones.
 */
/**
 * Stores a BSF/BSR result and updates EFLAGS, Intel flavour.
 *
 * All six status flags are cleared first.  When a bit was found, the zero-based
 * index is stored and PF is taken from g_afParity for that index; otherwise the
 * destination is left untouched and ZF and PF are set.
 *
 * @param   a_puDst     Where to store the zero-based bit index.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none was found.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_INTEL(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned iBit = (a_iBit); \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (iBit) \
        { \
            *(a_puDst) = --iBit; \
            fEfl |= g_afParity[iBit]; \
        } \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores a BSF/BSR result and updates EFLAGS, AMD flavour.
 *
 * Only ZF is modified: cleared when a bit was found (and the zero-based index
 * is stored), set otherwise (destination left untouched).
 *
 * @param   a_puDst     Where to store the zero-based bit index.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_iBit      One-based index of the bit found, 0 if none was found.
 *                      Evaluated exactly once.
 */
#define SET_BIT_SEARCH_RESULT_AMD(a_puDst, a_pfEFlags, a_iBit) do { \
        unsigned const iBit = (a_iBit); \
        if (iBit) \
        { \
            *(a_puDst)     = iBit - 1; \
            *(a_pfEFlags) &= ~X86_EFL_ZF; \
        } \
        else \
            *(a_pfEFlags) |= X86_EFL_ZF; \
    } while (0)
1426
1427
1428/*
1429 * BSF - first (least significant) bit set
1430 */
1431IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1432{
1433 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1434}
1435
1436IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1437{
1438 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1439}
1440
1441IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1442{
1443 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU64(uSrc));
1444}
1445
1446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1447
1448IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1449{
1450 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1451}
1452
1453IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1454{
1455 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1456}
1457
1458IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1459{
1460 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU32(uSrc));
1461}
1462
1463
1464IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1465{
1466 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1467}
1468
1469IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1470{
1471 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1472}
1473
1474IEM_DECL_IMPL_DEF(void, iemAImpl_bsf_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1475{
1476 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitFirstSetU16(uSrc));
1477}
1478
1479# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1480
1481
1482/*
1483 * BSR - last (most significant) bit set
1484 */
1485IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1486{
1487 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1488}
1489
1490IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1491{
1492 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1493}
1494
1495IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1496{
1497 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU64(uSrc));
1498}
1499
1500# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1501
1502IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1503{
1504 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1505}
1506
1507IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1508{
1509 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1510}
1511
1512IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1513{
1514 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU32(uSrc));
1515}
1516
1517
1518IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1519{
1520 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1521}
1522
1523IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1524{
1525 SET_BIT_SEARCH_RESULT_INTEL(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1526}
1527
1528IEM_DECL_IMPL_DEF(void, iemAImpl_bsr_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1529{
1530 SET_BIT_SEARCH_RESULT_AMD(puDst, pfEFlags, ASMBitLastSetU16(uSrc));
1531}
1532
1533# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1534
1535
1536/*
1537 * Helpers for LZCNT and TZCNT.
1538 */
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS, Intel flavour.
 *
 * All six status flags are cleared first.  PF is taken from g_afParity for a
 * non-zero count; a zero count sets ZF and PF.  CF is set when the source
 * operand is zero.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand the count was computed from.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_INTEL(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
        if (uResult) \
            fEfl |= g_afParity[uResult]; \
        else \
            fEfl |= X86_EFL_ZF | X86_EFL_PF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
/**
 * Stores an LZCNT/TZCNT result and updates EFLAGS, AMD flavour.
 *
 * Only ZF and CF are modified: ZF is set for a zero count, CF is set when the
 * source operand is zero.
 *
 * @param   a_puDst     Where to store the count.
 * @param   a_uSrc      The source operand the count was computed from.
 * @param   a_pfEFlags  Pointer to the EFLAGS value to update.
 * @param   a_uResult   The zero-bit count.  Evaluated exactly once.
 */
#define SET_BIT_CNT_SEARCH_RESULT_AMD(a_puDst, a_uSrc, a_pfEFlags, a_uResult) do { \
        unsigned const uResult = (a_uResult); \
        *(a_puDst) = uResult; \
        uint32_t fEfl = *(a_pfEFlags) & ~(X86_EFL_ZF | X86_EFL_CF); \
        if (!uResult) \
            fEfl |= X86_EFL_ZF; \
        if (!(a_uSrc)) \
            fEfl |= X86_EFL_CF; \
        *(a_pfEFlags) = fEfl; \
    } while (0)
1561
1562
1563/*
1564 * LZCNT - count leading zero bits.
1565 */
1566IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1567{
1568 iemAImpl_lzcnt_u64_intel(puDst, uSrc, pfEFlags);
1569}
1570
1571IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1572{
1573 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1574}
1575
1576IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1577{
1578 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU64(uSrc));
1579}
1580
1581# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1582
1583IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1584{
1585 iemAImpl_lzcnt_u32_intel(puDst, uSrc, pfEFlags);
1586}
1587
1588IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1589{
1590 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1591}
1592
1593IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1594{
1595 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU32(uSrc));
1596}
1597
1598
1599IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1600{
1601 iemAImpl_lzcnt_u16_intel(puDst, uSrc, pfEFlags);
1602}
1603
1604IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1605{
1606 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1607}
1608
1609IEM_DECL_IMPL_DEF(void, iemAImpl_lzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1610{
1611 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountLeadingZerosU16(uSrc));
1612}
1613
1614# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1615
1616
/*
 * TZCNT - count trailing zero bits.
 */
1620IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1621{
1622 iemAImpl_tzcnt_u64_intel(puDst, uSrc, pfEFlags);
1623}
1624
1625IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_intel,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1626{
1627 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1628}
1629
1630IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u64_amd,(uint64_t *puDst, uint64_t uSrc, uint32_t *pfEFlags))
1631{
1632 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU64(uSrc));
1633}
1634
1635# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1636
1637IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1638{
1639 iemAImpl_tzcnt_u32_intel(puDst, uSrc, pfEFlags);
1640}
1641
1642IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_intel,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1643{
1644 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1645}
1646
1647IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u32_amd,(uint32_t *puDst, uint32_t uSrc, uint32_t *pfEFlags))
1648{
1649 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU32(uSrc));
1650}
1651
1652
1653IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1654{
1655 iemAImpl_tzcnt_u16_intel(puDst, uSrc, pfEFlags);
1656}
1657
1658IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_intel,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1659{
1660 SET_BIT_CNT_SEARCH_RESULT_INTEL(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1661}
1662
1663IEM_DECL_IMPL_DEF(void, iemAImpl_tzcnt_u16_amd,(uint16_t *puDst, uint16_t uSrc, uint32_t *pfEFlags))
1664{
1665 SET_BIT_CNT_SEARCH_RESULT_AMD(puDst, uSrc, pfEFlags, ASMCountTrailingZerosU16(uSrc));
1666}
1667
1668# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1669#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
1670
1671/*
1672 * BEXTR (BMI1 instruction)
1673 */
1674#define EMIT_BEXTR(a_cBits, a_Type, a_Suffix) \
1675IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bextr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1676 a_Type uSrc2, uint32_t *pfEFlags)) \
1677{ \
1678 /* uSrc1 is considered virtually zero extended to 512 bits width. */ \
1679 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1680 a_Type uResult; \
1681 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1682 if (iFirstBit < a_cBits) \
1683 { \
1684 uResult = uSrc1 >> iFirstBit; \
1685 uint8_t const cBits = (uint8_t)(uSrc2 >> 8); \
1686 if (cBits < a_cBits) \
1687 uResult &= RT_CONCAT(RT_BIT_,a_cBits)(cBits) - 1; \
1688 *puDst = uResult; \
1689 if (!uResult) \
1690 fEfl |= X86_EFL_ZF; \
1691 } \
1692 else \
1693 { \
1694 *puDst = uResult = 0; \
1695 fEfl |= X86_EFL_ZF; \
1696 } \
1697 /** @todo complete flag calculations. */ \
1698 *pfEFlags = fEfl; \
1699}
1700
1701EMIT_BEXTR(64, uint64_t, _fallback)
1702EMIT_BEXTR(32, uint32_t, _fallback)
1703#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1704EMIT_BEXTR(64, uint64_t, RT_NOTHING)
1705#endif
1706#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1707EMIT_BEXTR(32, uint32_t, RT_NOTHING)
1708#endif
1709
1710/*
1711 * BLSR (BMI1 instruction)
1712 */
1713#define EMIT_BLSR(a_cBits, a_Type, a_Suffix) \
1714IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsr_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1715{ \
1716 uint32_t fEfl1 = *pfEFlags; \
1717 uint32_t fEfl2 = fEfl1; \
1718 *puDst = uSrc; \
1719 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1720 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1721 \
1722 /* AMD: The carry flag is from the SUB operation. */ \
1723 /* 10890xe: PF always cleared? */ \
1724 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1725 fEfl2 |= fEfl1 & X86_EFL_CF; \
1726 *pfEFlags = fEfl2; \
1727}
1728
1729EMIT_BLSR(64, uint64_t, _fallback)
1730EMIT_BLSR(32, uint32_t, _fallback)
1731#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1732EMIT_BLSR(64, uint64_t, RT_NOTHING)
1733#endif
1734#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1735EMIT_BLSR(32, uint32_t, RT_NOTHING)
1736#endif
1737
1738/*
1739 * BLSMSK (BMI1 instruction)
1740 */
1741#define EMIT_BLSMSK(a_cBits, a_Type, a_Suffix) \
1742IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsmsk_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1743{ \
1744 uint32_t fEfl1 = *pfEFlags; \
1745 uint32_t fEfl2 = fEfl1; \
1746 *puDst = uSrc; \
1747 iemAImpl_sub_u ## a_cBits(&uSrc, 1, &fEfl1); \
1748 iemAImpl_xor_u ## a_cBits(puDst, uSrc, &fEfl2); \
1749 \
1750 /* AMD: The carry flag is from the SUB operation. */ \
1751 /* 10890xe: PF always cleared? */ \
1752 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1753 fEfl2 |= fEfl1 & X86_EFL_CF; \
1754 *pfEFlags = fEfl2; \
1755}
1756
1757EMIT_BLSMSK(64, uint64_t, _fallback)
1758EMIT_BLSMSK(32, uint32_t, _fallback)
1759#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1760EMIT_BLSMSK(64, uint64_t, RT_NOTHING)
1761#endif
1762#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1763EMIT_BLSMSK(32, uint32_t, RT_NOTHING)
1764#endif
1765
1766/*
1767 * BLSI (BMI1 instruction)
1768 */
1769#define EMIT_BLSI(a_cBits, a_Type, a_Suffix) \
1770IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_blsi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1771{ \
1772 uint32_t fEfl1 = *pfEFlags; \
1773 uint32_t fEfl2 = fEfl1; \
1774 *puDst = uSrc; \
1775 iemAImpl_neg_u ## a_cBits(&uSrc, &fEfl1); \
1776 iemAImpl_and_u ## a_cBits(puDst, uSrc, &fEfl2); \
1777 \
1778 /* AMD: The carry flag is from the SUB operation. */ \
1779 /* 10890xe: PF always cleared? */ \
1780 fEfl2 &= ~(X86_EFL_CF | X86_EFL_PF); \
1781 fEfl2 |= fEfl1 & X86_EFL_CF; \
1782 *pfEFlags = fEfl2; \
1783}
1784
1785EMIT_BLSI(64, uint64_t, _fallback)
1786EMIT_BLSI(32, uint32_t, _fallback)
1787#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1788EMIT_BLSI(64, uint64_t, RT_NOTHING)
1789#endif
1790#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1791EMIT_BLSI(32, uint32_t, RT_NOTHING)
1792#endif
1793
1794/*
1795 * BZHI (BMI2 instruction)
1796 */
1797#define EMIT_BZHI(a_cBits, a_Type, a_Suffix) \
1798IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_bzhi_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc1, \
1799 a_Type uSrc2, uint32_t *pfEFlags)) \
1800{ \
1801 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1802 a_Type uResult; \
1803 uint8_t const iFirstBit = (uint8_t)uSrc2; \
1804 if (iFirstBit < a_cBits) \
1805 uResult = uSrc1 & (((a_Type)1 << iFirstBit) - 1); \
1806 else \
1807 { \
1808 uResult = uSrc1; \
1809 fEfl |= X86_EFL_CF; \
1810 } \
1811 *puDst = uResult; \
1812 fEfl |= X86_EFL_CALC_ZF(uResult); \
1813 fEfl |= X86_EFL_CALC_SF(uResult, a_cBits); \
1814 *pfEFlags = fEfl; \
1815}
1816
1817EMIT_BZHI(64, uint64_t, _fallback)
1818EMIT_BZHI(32, uint32_t, _fallback)
1819#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1820EMIT_BZHI(64, uint64_t, RT_NOTHING)
1821#endif
1822#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1823EMIT_BZHI(32, uint32_t, RT_NOTHING)
1824#endif
1825
1826/*
1827 * POPCNT
1828 */
/** Number of set bits for every possible 6-bit value (index = the value). */
RT_ALIGNAS_VAR(64) static uint8_t const g_abBitCounts6[64] =
{
    /* 0x00..0x0f */ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    /* 0x10..0x1f */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x20..0x2f */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x30..0x3f */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
};

/** @todo Use native popcount where possible and employ some more efficient
 *        algorithm here (or in asm.h fallback)! */

/**
 * Counts the set bits in a 16-bit value, six bits at a time via g_abBitCounts6.
 *
 * @returns Number of set bits (0..16).
 * @param   u16     The value to examine.
 */
DECLINLINE(uint8_t) iemPopCountU16(uint16_t u16)
{
    unsigned cBitsSet = 0;
    for (unsigned uLeft = u16; uLeft != 0; uLeft >>= 6)
        cBitsSet += g_abBitCounts6[uLeft & 0x3f];
    return (uint8_t)cBitsSet;
}

/**
 * Counts the set bits in a 32-bit value, six bits at a time via g_abBitCounts6.
 *
 * @returns Number of set bits (0..32).
 * @param   u32     The value to examine.
 */
DECLINLINE(uint8_t) iemPopCountU32(uint32_t u32)
{
    unsigned cBitsSet = 0;
    for (uint32_t uLeft = u32; uLeft != 0; uLeft >>= 6)
        cBitsSet += g_abBitCounts6[uLeft & 0x3f];
    return (uint8_t)cBitsSet;
}

/**
 * Counts the set bits in a 64-bit value, six bits at a time via g_abBitCounts6.
 *
 * @returns Number of set bits (0..64).
 * @param   u64     The value to examine.
 */
DECLINLINE(uint8_t) iemPopCountU64(uint64_t u64)
{
    unsigned cBitsSet = 0;
    for (uint64_t uLeft = u64; uLeft != 0; uLeft >>= 6)
        cBitsSet += g_abBitCounts6[uLeft & 0x3f];
    return (uint8_t)cBitsSet;
}
1871
1872#define EMIT_POPCNT(a_cBits, a_Type, a_Suffix) \
1873IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_popcnt_u,a_cBits,a_Suffix),(a_Type *puDst, a_Type uSrc, uint32_t *pfEFlags)) \
1874{ \
1875 uint32_t fEfl = *pfEFlags & ~(X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF); \
1876 a_Type uResult; \
1877 if (uSrc) \
1878 uResult = iemPopCountU ## a_cBits(uSrc); \
1879 else \
1880 { \
1881 fEfl |= X86_EFL_ZF; \
1882 uResult = 0; \
1883 } \
1884 *puDst = uResult; \
1885 *pfEFlags = fEfl; \
1886}
1887
1888EMIT_POPCNT(64, uint64_t, _fallback)
1889EMIT_POPCNT(32, uint32_t, _fallback)
1890EMIT_POPCNT(16, uint16_t, _fallback)
1891#if defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1892EMIT_POPCNT(64, uint64_t, RT_NOTHING)
1893#endif
1894#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
1895EMIT_POPCNT(32, uint32_t, RT_NOTHING)
1896EMIT_POPCNT(16, uint16_t, RT_NOTHING)
1897#endif
1898
1899
1900#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
1901
1902/*
1903 * XCHG
1904 */
1905
1906IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_locked,(uint64_t *puMem, uint64_t *puReg))
1907{
1908#if ARCH_BITS >= 64
1909 *puReg = ASMAtomicXchgU64(puMem, *puReg);
1910#else
1911 uint64_t uOldMem = *puMem;
1912 while (!ASMAtomicCmpXchgExU64(puMem, *puReg, uOldMem, &uOldMem))
1913 ASMNopPause();
1914 *puReg = uOldMem;
1915#endif
1916}
1917
1918# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1919
1920IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_locked,(uint32_t *puMem, uint32_t *puReg))
1921{
1922 *puReg = ASMAtomicXchgU32(puMem, *puReg);
1923}
1924
1925
1926IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_locked,(uint16_t *puMem, uint16_t *puReg))
1927{
1928 *puReg = ASMAtomicXchgU16(puMem, *puReg);
1929}
1930
1931
1932IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_locked,(uint8_t *puMem, uint8_t *puReg))
1933{
1934 *puReg = ASMAtomicXchgU8(puMem, *puReg);
1935}
1936
1937# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1938
1939
1940/* Unlocked variants for fDisregardLock mode: */
1941
/**
 * Unlocked 64-bit XCHG: plain (non-atomic) swap of *puMem and *puReg.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u64_unlocked,(uint64_t *puMem, uint64_t *puReg))
{
    uint64_t const uNewMem = *puReg;
    *puReg = *puMem;
    *puMem = uNewMem;
}
1948
1949# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
1950
/**
 * Unlocked 32-bit XCHG: plain (non-atomic) swap of *puMem and *puReg.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u32_unlocked,(uint32_t *puMem, uint32_t *puReg))
{
    uint32_t const uNewMem = *puReg;
    *puReg = *puMem;
    *puMem = uNewMem;
}
1957
1958
/**
 * Unlocked 16-bit XCHG: plain (non-atomic) swap of *puMem and *puReg.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u16_unlocked,(uint16_t *puMem, uint16_t *puReg))
{
    uint16_t const uNewMem = *puReg;
    *puReg = *puMem;
    *puMem = uNewMem;
}
1965
1966
/**
 * Unlocked 8-bit XCHG: plain (non-atomic) swap of *puMem and *puReg.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_xchg_u8_unlocked,(uint8_t *puMem, uint8_t *puReg))
{
    uint8_t const uNewMem = *puReg;
    *puReg = *puMem;
    *puMem = uNewMem;
}
1973
1974# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
1975
1976
1977/*
1978 * XADD and LOCK XADD.
1979 */
1980#define EMIT_XADD(a_cBitsWidth, a_Type) \
1981IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1982{ \
1983 a_Type uDst = *puDst; \
1984 a_Type uResult = uDst; \
1985 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, pfEFlags); \
1986 *puDst = uResult; \
1987 *puReg = uDst; \
1988} \
1989\
1990IEM_DECL_IMPL_DEF(void, iemAImpl_xadd_u ## a_cBitsWidth ## _locked,(a_Type *puDst, a_Type *puReg, uint32_t *pfEFlags)) \
1991{ \
1992 a_Type uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
1993 a_Type uResult; \
1994 uint32_t fEflTmp; \
1995 do \
1996 { \
1997 uResult = uOld; \
1998 fEflTmp = *pfEFlags; \
1999 iemAImpl_add_u ## a_cBitsWidth(&uResult, *puReg, &fEflTmp); \
2000 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uResult, uOld, &uOld)); \
2001 *puReg = uOld; \
2002 *pfEFlags = fEflTmp; \
2003}
2004EMIT_XADD(64, uint64_t)
2005# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2006EMIT_XADD(32, uint32_t)
2007EMIT_XADD(16, uint16_t)
2008EMIT_XADD(8, uint8_t)
2009# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2010
2011#endif
2012
2013/*
2014 * CMPXCHG, CMPXCHG8B, CMPXCHG16B
2015 *
2016 * Note! We don't have non-locking/atomic cmpxchg primitives, so all cmpxchg
2017 * instructions are emulated as locked.
2018 */
2019#if defined(IEM_WITHOUT_ASSEMBLY)
2020
2021IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8_locked, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
2022{
2023 uint8_t uOld = *puAl;
2024 if (ASMAtomicCmpXchgExU8(pu8Dst, uSrcReg, uOld, puAl))
2025 Assert(*puAl == uOld);
2026 iemAImpl_cmp_u8(&uOld, *puAl, pEFlags);
2027}
2028
2029
2030IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16_locked,(uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
2031{
2032 uint16_t uOld = *puAx;
2033 if (ASMAtomicCmpXchgExU16(pu16Dst, uSrcReg, uOld, puAx))
2034 Assert(*puAx == uOld);
2035 iemAImpl_cmp_u16(&uOld, *puAx, pEFlags);
2036}
2037
2038
2039IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32_locked,(uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
2040{
2041 uint32_t uOld = *puEax;
2042 if (ASMAtomicCmpXchgExU32(pu32Dst, uSrcReg, uOld, puEax))
2043 Assert(*puEax == uOld);
2044 iemAImpl_cmp_u32(&uOld, *puEax, pEFlags);
2045}
2046
2047
2048# if ARCH_BITS == 32
2049IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
2050# else
2051IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64_locked,(uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
2052# endif
2053{
2054# if ARCH_BITS == 32
2055 uint64_t const uSrcReg = *puSrcReg;
2056# endif
2057 uint64_t uOld = *puRax;
2058 if (ASMAtomicCmpXchgExU64(pu64Dst, uSrcReg, uOld, puRax))
2059 Assert(*puRax == uOld);
2060 iemAImpl_cmp_u64(&uOld, *puRax, pEFlags);
2061}
2062
2063
2064IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b_locked,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
2065 uint32_t *pEFlags))
2066{
2067 uint64_t const uNew = pu64EbxEcx->u;
2068 uint64_t const uOld = pu64EaxEdx->u;
2069 if (ASMAtomicCmpXchgExU64(pu64Dst, uNew, uOld, &pu64EaxEdx->u))
2070 {
2071 Assert(pu64EaxEdx->u == uOld);
2072 *pEFlags |= X86_EFL_ZF;
2073 }
2074 else
2075 *pEFlags &= ~X86_EFL_ZF;
2076}
2077
2078
# if defined(RT_ARCH_AMD64) || defined(RT_ARCH_ARM64)
/**
 * Locked CMPXCHG16B: compare-exchanges *pu128Dst against RDX:RAX with RCX:RBX
 * as the replacement, setting ZF on success and clearing it on failure.  No
 * other EFLAGS bit is touched.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_locked,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                                    uint32_t *pEFlags))
{
#  ifdef VBOX_STRICT
    /* Only needed by the assertion below. */
    RTUINT128U const uExpected = *pu128RaxRdx;
#  endif
#  if defined(RT_ARCH_AMD64)
    bool const fSwapped = ASMAtomicCmpXchgU128v2(&pu128Dst->u, pu128RbxRcx->s.Hi, pu128RbxRcx->s.Lo,
                                                 pu128RaxRdx->s.Hi, pu128RaxRdx->s.Lo, &pu128RaxRdx->u);
#  else
    bool const fSwapped = ASMAtomicCmpXchgU128(&pu128Dst->u, pu128RbxRcx->u, pu128RaxRdx->u, &pu128RaxRdx->u);
#  endif
    if (!fSwapped)
    {
        *pEFlags &= ~X86_EFL_ZF;
        return;
    }

    Assert(pu128RaxRdx->s.Lo == uExpected.s.Lo && pu128RaxRdx->s.Hi == uExpected.s.Hi);
    *pEFlags |= X86_EFL_ZF;
}
# endif
2100
2101#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2102
2103# if !defined(RT_ARCH_ARM64) /** @todo may need this for unaligned accesses... */
2104IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b_fallback,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx,
2105 PRTUINT128U pu128RbxRcx, uint32_t *pEFlags))
2106{
2107 RTUINT128U u128Tmp = *pu128Dst;
2108 if ( u128Tmp.s.Lo == pu128RaxRdx->s.Lo
2109 && u128Tmp.s.Hi == pu128RaxRdx->s.Hi)
2110 {
2111 *pu128Dst = *pu128RbxRcx;
2112 *pEFlags |= X86_EFL_ZF;
2113 }
2114 else
2115 {
2116 *pu128RaxRdx = u128Tmp;
2117 *pEFlags &= ~X86_EFL_ZF;
2118 }
2119}
2120#endif /* !RT_ARCH_ARM64 */
2121
2122#if defined(IEM_WITHOUT_ASSEMBLY)
2123
2124/* Unlocked versions mapped to the locked ones: */
2125
/** Unlocked 8-bit CMPXCHG; simply forwards to iemAImpl_cmpxchg_u8_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u8, (uint8_t *pu8Dst, uint8_t *puAl, uint8_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u8_locked(pu8Dst, puAl, uSrcReg, pEFlags);
}
2130
2131
/** Unlocked 16-bit CMPXCHG; simply forwards to iemAImpl_cmpxchg_u16_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u16, (uint16_t *pu16Dst, uint16_t *puAx, uint16_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u16_locked(pu16Dst, puAx, uSrcReg, pEFlags);
}
2136
2137
/** Unlocked 32-bit CMPXCHG; simply forwards to iemAImpl_cmpxchg_u32_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u32, (uint32_t *pu32Dst, uint32_t *puEax, uint32_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u32_locked(pu32Dst, puEax, uSrcReg, pEFlags);
}
2142
2143
/* Unlocked 64-bit CMPXCHG; simply forwards to iemAImpl_cmpxchg_u64_locked.
   The source register is passed by reference when ARCH_BITS == 32 and by
   value otherwise, mirroring the locked variant's signature. */
# if ARCH_BITS == 32
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t *puSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, puSrcReg, pEFlags);
}
# else
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg_u64, (uint64_t *pu64Dst, uint64_t *puRax, uint64_t uSrcReg, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg_u64_locked(pu64Dst, puRax, uSrcReg, pEFlags);
}
# endif
2155
2156
/** Unlocked CMPXCHG8B; simply forwards to iemAImpl_cmpxchg8b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx, uint32_t *pEFlags))
{
    iemAImpl_cmpxchg8b_locked(pu64Dst, pu64EaxEdx, pu64EbxEcx, pEFlags);
}
2161
2162
/** Unlocked CMPXCHG16B; simply forwards to iemAImpl_cmpxchg16b_locked. */
IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
                                             uint32_t *pEFlags))
{
    iemAImpl_cmpxchg16b_locked(pu128Dst, pu128RaxRdx, pu128RbxRcx, pEFlags);
}
2168
2169#endif /* defined(IEM_WITHOUT_ASSEMBLY) */
2170
2171#if (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) \
2172 && !defined(DOXYGEN_RUNNING) /* Doxygen has some groking issues here and ends up mixing up input. Not worth tracking down now. */
2173
2174/*
2175 * MUL, IMUL, DIV and IDIV helpers.
2176 *
2177 * - The U64 versions must use 128-bit intermediates, so we need to abstract the
2178 * division step so we can select between using C operators and
2179 * RTUInt128DivRem/RTUInt128MulU64ByU64.
2180 *
 * - The U8 versions return their output in AL + AH instead of xDX + xAX, and
 *   IDIV/DIV likewise take all of their input from AX. This means we have to
 *   abstract some input loads and the result storing.
2184 */
2185
2186DECLINLINE(void) RTUInt128DivRemByU64(PRTUINT128U pQuotient, PRTUINT128U pRemainder, PCRTUINT128U pDividend, uint64_t u64Divisor)
2187{
2188# ifdef __GNUC__ /* GCC maybe really annoying in function. */
2189 pQuotient->s.Lo = 0;
2190 pQuotient->s.Hi = 0;
2191# endif
2192 RTUINT128U Divisor;
2193 Divisor.s.Lo = u64Divisor;
2194 Divisor.s.Hi = 0;
2195 RTUInt128DivRem(pQuotient, pRemainder, pDividend, &Divisor);
2196}
2197
/*
 * Operand access and arithmetic primitives plugged into the EMIT_MUL,
 * EMIT_IMUL, EMIT_DIV and EMIT_IDIV templates.
 *
 * The load/store macros rely on the emitted function's parameter names (puA
 * and puD, or puAX for the 8-bit variants).  Every macro expands to a single
 * expression without a trailing semicolon and with its parameters
 * parenthesized, so it is safe as the body of an unbraced if/else.
 */

/* Load the double-width dividend from xDX:xAX (or from AX for 8-bit). */
# define DIV_LOAD(a_Dividend) \
    ((a_Dividend).s.Lo = *puA, (a_Dividend).s.Hi = *puD)
# define DIV_LOAD_U8(a_Dividend) \
    ((a_Dividend).u = *puAX)

/* Store quotient + remainder in xAX + xDX (or AL + AH for 8-bit). */
# define DIV_STORE(a_Quotient, a_uRemainder)    (*puA = (a_Quotient), *puD = (a_uRemainder))
# define DIV_STORE_U8(a_Quotient, a_uRemainder) (*puAX = (uint8_t)(a_Quotient) | ((uint16_t)(a_uRemainder) << 8))

/* Load the implicit first factor from xAX (or AL for 8-bit). */
# define MUL_LOAD_F1()                          (*puA)
# define MUL_LOAD_F1_U8()                       ((uint8_t)*puAX)

/* Store the double-width product in xDX:xAX (or AX for 8-bit). */
# define MUL_STORE(a_Result)                    (*puA = (a_Result).s.Lo, *puD = (a_Result).s.Hi)
# define MUL_STORE_U8(a_Result)                 (*puAX = (a_Result).u)

/* Two's complement negation of a double-width value. */
# define MULDIV_NEG(a_Value, a_cBitsWidth2x) \
    ((a_Value).u = UINT ## a_cBitsWidth2x ## _C(0) - (a_Value).u)
# define MULDIV_NEG_U128(a_Value, a_cBitsWidth2x) \
    RTUInt128AssignNeg(&(a_Value))

/* Unsigned widening multiplication. */
# define MULDIV_MUL(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    ((a_Result).u = (uint ## a_cBitsWidth2x ## _t)(a_Factor1) * (a_Factor2))
# define MULDIV_MUL_U128(a_Result, a_Factor1, a_Factor2, a_cBitsWidth2x) \
    RTUInt128MulU64ByU64(&(a_Result), a_Factor1, a_Factor2)

/* Unsigned division yielding both quotient and remainder. */
# define MULDIV_MODDIV(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    ((a_Quotient).u  = (a_Dividend).u / (a_uDivisor), \
     (a_Remainder).u = (a_Dividend).u % (a_uDivisor))
# define MULDIV_MODDIV_U128(a_Quotient, a_Remainder, a_Dividend, a_uDivisor) \
    RTUInt128DivRemByU64(&(a_Quotient), &(a_Remainder), &(a_Dividend), a_uDivisor)
2227
2228
2229/*
2230 * MUL
2231 */
2232# define EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, a_Suffix, a_fIntelFlags) \
2233IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_mul_u,a_cBitsWidth,a_Suffix), a_Args) \
2234{ \
2235 RTUINT ## a_cBitsWidth2x ## U Result; \
2236 a_fnMul(Result, a_fnLoadF1(), uFactor, a_cBitsWidth2x); \
2237 a_fnStore(Result); \
2238 \
2239 /* Calc EFLAGS: */ \
2240 uint32_t fEfl = *pfEFlags; \
2241 if (a_fIntelFlags) \
2242 { /* Intel: 6700K and 10980XE behavior */ \
2243 fEfl &= ~(X86_EFL_SF | X86_EFL_CF | X86_EFL_OF | X86_EFL_AF | X86_EFL_ZF | X86_EFL_PF); \
2244 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2245 fEfl |= X86_EFL_SF; \
2246 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2247 if (Result.s.Hi != 0) \
2248 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2249 } \
2250 else \
2251 { /* AMD: 3990X */ \
2252 if (Result.s.Hi != 0) \
2253 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2254 else \
2255 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2256 } \
2257 *pfEFlags = fEfl; \
2258 return 0; \
2259} \
2260
2261# define EMIT_MUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul) \
2262 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, RT_NOTHING, 1) \
2263 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _intel, 1) \
2264 EMIT_MUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnMul, _amd, 0) \
2265
2266# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2267EMIT_MUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2268 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL_U128)
2269# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2270EMIT_MUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2271 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2272EMIT_MUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor, uint32_t *pfEFlags), (puA, puD, uFactor, pfEFlags),
2273 MUL_LOAD_F1, MUL_STORE, MULDIV_MUL)
2274EMIT_MUL(8, 16, (uint16_t *puAX, uint8_t uFactor, uint32_t *pfEFlags), (puAX, uFactor, pfEFlags),
2275 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_MUL)
2276# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2277# endif /* !DOXYGEN_RUNNING */
2278
2279/*
2280 * MULX
2281 */
2282# define EMIT_MULX(a_cBitsWidth, a_cBitsWidth2x, a_uType, a_fnMul, a_Suffix) \
2283IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_mulx_u,a_cBitsWidth,a_Suffix), \
2284 (a_uType *puDst1, a_uType *puDst2, a_uType uSrc1, a_uType uSrc2)) \
2285{ \
2286 RTUINT ## a_cBitsWidth2x ## U Result; \
2287 a_fnMul(Result, uSrc1, uSrc2, a_cBitsWidth2x); \
2288 *puDst2 = Result.s.Lo; /* Lower part first, as we should return the high part when puDst2 == puDst1. */ \
2289 *puDst1 = Result.s.Hi; \
2290} \
2291
2292# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2293EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, RT_NOTHING)
2294EMIT_MULX(64, 128, uint64_t, MULDIV_MUL_U128, _fallback)
2295# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2296EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, RT_NOTHING)
2297EMIT_MULX(32, 64, uint32_t, MULDIV_MUL, _fallback)
2298# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2299# endif /* !DOXYGEN_RUNNING */
2300
2301
2302/*
2303 * IMUL
2304 *
2305 * The SF, ZF, AF and PF flags are "undefined". AMD (3990x) leaves these
2306 * flags as is. Whereas Intel skylake (6700K and 10980X (Cascade Lake)) always
2307 * clear AF and ZF and calculates SF and PF as per the lower half of the result.
2308 */
2309# define EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, \
2310 a_Suffix, a_fIntelFlags) \
2311IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_imul_u,a_cBitsWidth,a_Suffix),a_Args) \
2312{ \
2313 RTUINT ## a_cBitsWidth2x ## U Result; \
2314 uint32_t fEfl = *pfEFlags & ~(X86_EFL_CF | X86_EFL_OF); \
2315 \
2316 uint ## a_cBitsWidth ## _t const uFactor1 = a_fnLoadF1(); \
2317 if (!(uFactor1 & RT_BIT_64(a_cBitsWidth - 1))) \
2318 { \
2319 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2320 { \
2321 a_fnMul(Result, uFactor1, uFactor2, a_cBitsWidth2x); \
2322 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2323 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2324 } \
2325 else \
2326 { \
2327 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2328 a_fnMul(Result, uFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2329 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2330 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2331 a_fnNeg(Result, a_cBitsWidth2x); \
2332 } \
2333 } \
2334 else \
2335 { \
2336 if (!(uFactor2 & RT_BIT_64(a_cBitsWidth - 1))) \
2337 { \
2338 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2339 a_fnMul(Result, uPositiveFactor1, uFactor2, a_cBitsWidth2x); \
2340 if (Result.s.Hi != 0 || Result.s.Lo > RT_BIT_64(a_cBitsWidth - 1)) \
2341 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2342 a_fnNeg(Result, a_cBitsWidth2x); \
2343 } \
2344 else \
2345 { \
2346 uint ## a_cBitsWidth ## _t const uPositiveFactor1 = UINT ## a_cBitsWidth ## _C(0) - uFactor1; \
2347 uint ## a_cBitsWidth ## _t const uPositiveFactor2 = UINT ## a_cBitsWidth ## _C(0) - uFactor2; \
2348 a_fnMul(Result, uPositiveFactor1, uPositiveFactor2, a_cBitsWidth2x); \
2349 if (Result.s.Hi != 0 || Result.s.Lo >= RT_BIT_64(a_cBitsWidth - 1)) \
2350 fEfl |= X86_EFL_CF | X86_EFL_OF; \
2351 } \
2352 } \
2353 a_fnStore(Result); \
2354 \
2355 if (a_fIntelFlags) \
2356 { \
2357 fEfl &= ~(X86_EFL_AF | X86_EFL_ZF | X86_EFL_SF | X86_EFL_PF); \
2358 if (Result.s.Lo & RT_BIT_64(a_cBitsWidth - 1)) \
2359 fEfl |= X86_EFL_SF; \
2360 fEfl |= g_afParity[Result.s.Lo & 0xff]; \
2361 } \
2362 *pfEFlags = fEfl; \
2363 return 0; \
2364}
2365# define EMIT_IMUL(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul) \
2366 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, RT_NOTHING, 1) \
2367 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _intel, 1) \
2368 EMIT_IMUL_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoadF1, a_fnStore, a_fnNeg, a_fnMul, _amd, 0)
2369
2370# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2371EMIT_IMUL(64, 128, (uint64_t *puA, uint64_t *puD, uint64_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2372 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG_U128, MULDIV_MUL_U128)
2373# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2374EMIT_IMUL(32, 64, (uint32_t *puA, uint32_t *puD, uint32_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2375 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2376EMIT_IMUL(16, 32, (uint16_t *puA, uint16_t *puD, uint16_t uFactor2, uint32_t *pfEFlags), (puA, puD, uFactor2, pfEFlags),
2377 MUL_LOAD_F1, MUL_STORE, MULDIV_NEG, MULDIV_MUL)
2378EMIT_IMUL(8, 16, (uint16_t *puAX, uint8_t uFactor2, uint32_t *pfEFlags), (puAX, uFactor2, pfEFlags),
2379 MUL_LOAD_F1_U8, MUL_STORE_U8, MULDIV_NEG, MULDIV_MUL)
2380# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2381# endif /* !DOXYGEN_RUNNING */
2382
2383
2384/*
2385 * IMUL with two operands are mapped onto the three operand variant, ignoring
2386 * the high part of the product.
2387 */
2388# define EMIT_IMUL_TWO(a_cBits, a_uType) \
2389IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2390{ \
2391 a_uType uIgn; \
2392 iemAImpl_imul_u ## a_cBits(puDst, &uIgn, uSrc, pfEFlags); \
2393} \
2394\
2395IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _intel,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2396{ \
2397 a_uType uIgn; \
2398 iemAImpl_imul_u ## a_cBits ## _intel(puDst, &uIgn, uSrc, pfEFlags); \
2399} \
2400\
2401IEM_DECL_IMPL_DEF(void, iemAImpl_imul_two_u ## a_cBits ## _amd,(a_uType *puDst, a_uType uSrc, uint32_t *pfEFlags)) \
2402{ \
2403 a_uType uIgn; \
2404 iemAImpl_imul_u ## a_cBits ## _amd(puDst, &uIgn, uSrc, pfEFlags); \
2405}
2406
2407EMIT_IMUL_TWO(64, uint64_t)
2408# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2409EMIT_IMUL_TWO(32, uint32_t)
2410EMIT_IMUL_TWO(16, uint16_t)
2411# endif
2412
2413
2414/*
2415 * DIV
2416 */
2417# define EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, \
2418 a_Suffix, a_fIntelFlags) \
2419IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_div_u,a_cBitsWidth,a_Suffix),a_Args) \
2420{ \
2421 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2422 a_fnLoad(Dividend); \
2423 if ( uDivisor != 0 \
2424 && Dividend.s.Hi < uDivisor) \
2425 { \
2426 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2427 a_fnDivRem(Quotient, Remainder, Dividend, uDivisor); \
2428 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2429 \
2430 /* Calc EFLAGS: Intel 6700K and 10980XE leaves them alone. AMD 3990X sets AF and clears PF, ZF and SF. */ \
2431 if (!a_fIntelFlags) \
2432 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2433 return 0; \
2434 } \
2435 /* #DE */ \
2436 return -1; \
2437}
2438# define EMIT_DIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem) \
2439 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, RT_NOTHING, 1) \
2440 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _intel, 1) \
2441 EMIT_DIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnDivRem, _amd, 0)
2442
2443# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2444EMIT_DIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2445 DIV_LOAD, DIV_STORE, MULDIV_MODDIV_U128)
2446# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2447EMIT_DIV(32,64, (uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2448 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2449EMIT_DIV(16,32, (uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2450 DIV_LOAD, DIV_STORE, MULDIV_MODDIV)
2451EMIT_DIV(8,16, (uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2452 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_MODDIV)
2453# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2454# endif /* !DOXYGEN_RUNNING */
2455
2456
2457/*
2458 * IDIV
2459 *
2460 * EFLAGS are ignored and left as-is by Intel 6700K and 10980XE. AMD 3990X will
2461 * set AF and clear PF, ZF and SF just like it does for DIV.
2462 *
2463 */
2464# define EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, \
2465 a_Suffix, a_fIntelFlags) \
2466IEM_DECL_IMPL_DEF(int, RT_CONCAT3(iemAImpl_idiv_u,a_cBitsWidth,a_Suffix),a_Args) \
2467{ \
2468 /* Note! Skylake leaves all flags alone. */ \
2469 \
2470 /** @todo overflow checks */ \
2471 if (uDivisor != 0) \
2472 { \
2473 /* \
2474 * Convert to unsigned division. \
2475 */ \
2476 RTUINT ## a_cBitsWidth2x ## U Dividend; \
2477 a_fnLoad(Dividend); \
2478 bool const fSignedDividend = RT_BOOL(Dividend.s.Hi & RT_BIT_64(a_cBitsWidth - 1)); \
2479 if (fSignedDividend) \
2480 a_fnNeg(Dividend, a_cBitsWidth2x); \
2481 \
2482 uint ## a_cBitsWidth ## _t uDivisorPositive; \
2483 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2484 uDivisorPositive = uDivisor; \
2485 else \
2486 uDivisorPositive = UINT ## a_cBitsWidth ## _C(0) - uDivisor; \
2487 \
2488 RTUINT ## a_cBitsWidth2x ## U Remainder, Quotient; \
2489 a_fnDivRem(Quotient, Remainder, Dividend, uDivisorPositive); \
2490 \
2491 /* \
2492 * Setup the result, checking for overflows. \
2493 */ \
2494 if (!(uDivisor & RT_BIT_64(a_cBitsWidth - 1))) \
2495 { \
2496 if (!fSignedDividend) \
2497 { \
2498 /* Positive divisor, positive dividend => result positive. */ \
2499 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2500 { \
2501 a_fnStore(Quotient.s.Lo, Remainder.s.Lo); \
2502 if (!a_fIntelFlags) \
2503 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2504 return 0; \
2505 } \
2506 } \
2507 else \
2508 { \
2509 /* Positive divisor, negative dividend => result negative. */ \
2510 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2511 { \
2512 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2513 if (!a_fIntelFlags) \
2514 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2515 return 0; \
2516 } \
2517 } \
2518 } \
2519 else \
2520 { \
2521 if (!fSignedDividend) \
2522 { \
2523 /* Negative divisor, positive dividend => negative quotient, positive remainder. */ \
2524 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= RT_BIT_64(a_cBitsWidth - 1)) \
2525 { \
2526 a_fnStore(UINT ## a_cBitsWidth ## _C(0) - Quotient.s.Lo, Remainder.s.Lo); \
2527 if (!a_fIntelFlags) \
2528 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2529 return 0; \
2530 } \
2531 } \
2532 else \
2533 { \
2534 /* Negative divisor, negative dividend => positive quotient, negative remainder. */ \
2535 if (Quotient.s.Hi == 0 && Quotient.s.Lo <= (uint ## a_cBitsWidth ## _t)INT ## a_cBitsWidth ## _MAX) \
2536 { \
2537 a_fnStore(Quotient.s.Lo, UINT ## a_cBitsWidth ## _C(0) - Remainder.s.Lo); \
2538 if (!a_fIntelFlags) \
2539 *pfEFlags = (*pfEFlags & ~(X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF)) | X86_EFL_AF; \
2540 return 0; \
2541 } \
2542 } \
2543 } \
2544 } \
2545 /* #DE */ \
2546 return -1; \
2547}
2548# define EMIT_IDIV(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem) \
2549 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, RT_NOTHING, 1) \
2550 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _intel, 1) \
2551 EMIT_IDIV_INNER(a_cBitsWidth, a_cBitsWidth2x, a_Args, a_CallArgs, a_fnLoad, a_fnStore, a_fnNeg, a_fnDivRem, _amd, 0)
2552
2553# ifndef DOXYGEN_RUNNING /* this totally confuses doxygen for some reason */
2554EMIT_IDIV(64,128,(uint64_t *puA, uint64_t *puD, uint64_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2555 DIV_LOAD, DIV_STORE, MULDIV_NEG_U128, MULDIV_MODDIV_U128)
2556# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2557EMIT_IDIV(32,64,(uint32_t *puA, uint32_t *puD, uint32_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2558 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2559EMIT_IDIV(16,32,(uint16_t *puA, uint16_t *puD, uint16_t uDivisor, uint32_t *pfEFlags), (puA, puD, uDivisor, pfEFlags),
2560 DIV_LOAD, DIV_STORE, MULDIV_NEG, MULDIV_MODDIV)
2561EMIT_IDIV(8,16,(uint16_t *puAX, uint8_t uDivisor, uint32_t *pfEFlags), (puAX, uDivisor, pfEFlags),
2562 DIV_LOAD_U8, DIV_STORE_U8, MULDIV_NEG, MULDIV_MODDIV)
2563# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2564# endif /* !DOXYGEN_RUNNING */
2565
2566#endif /* (!defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)) && !defined(DOXYGEN_RUNNING) */
2567
2568
2569/*********************************************************************************************************************************
2570* Unary operations. *
2571*********************************************************************************************************************************/
2572#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2573
/** @def IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC
 * Updates the status bits (PF, AF, ZF, SF, and OF) for an INC or DEC
 * instruction, writing the result back through @a a_pfEFlags.
 *
 * CF is NOT modified for hysterical raisins (allegedly for carrying and
 * borrowing in arithmetic loops on intel 8008).
 *
 * This expands to a do/while(0) statement, so it does not yield a value.  The
 * @a a_uResult and @a a_uDst arguments are evaluated more than once and should
 * therefore be free of side effects.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for AF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 * @param   a_OfMethod      0 for INC-style, 1 for DEC-style.
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth, a_OfMethod) \
    do { \
        uint32_t fEflTmp = *(a_pfEFlags); \
        /* Clear all status bits except CF ('~' binds tighter than '|'). */ \
        fEflTmp &= ~X86_EFL_STATUS_BITS | X86_EFL_CF; \
        fEflTmp |= g_afParity[(a_uResult) & 0xff]; \
        /* AF: set when result and original destination differ in the AF bit position. */ \
        fEflTmp |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflTmp |= X86_EFL_CALC_ZF(a_uResult); \
        fEflTmp |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        /* OF input - INC: destination sign bit clear and result sign bit set; DEC: the reverse. */ \
        fEflTmp |= X86_EFL_GET_OF_ ## a_cBitsWidth(a_OfMethod == 0 ? (((a_uDst) ^ RT_BIT_64(a_cBitsWidth - 1)) & (a_uResult)) \
                                                   : ((a_uDst) & ((a_uResult) ^ RT_BIT_64(a_cBitsWidth - 1))) ); \
        *(a_pfEFlags) = fEflTmp; \
    } while (0)
2599
2600/*
2601 * INC
2602 */
2603
2604IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2605{
2606 uint64_t uDst = *puDst;
2607 uint64_t uResult = uDst + 1;
2608 *puDst = uResult;
2609 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 0 /*INC*/);
2610}
2611
2612# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2613
2614IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2615{
2616 uint32_t uDst = *puDst;
2617 uint32_t uResult = uDst + 1;
2618 *puDst = uResult;
2619 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 0 /*INC*/);
2620}
2621
2622
2623IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2624{
2625 uint16_t uDst = *puDst;
2626 uint16_t uResult = uDst + 1;
2627 *puDst = uResult;
2628 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 0 /*INC*/);
2629}
2630
2631IEM_DECL_IMPL_DEF(void, iemAImpl_inc_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2632{
2633 uint8_t uDst = *puDst;
2634 uint8_t uResult = uDst + 1;
2635 *puDst = uResult;
2636 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 0 /*INC*/);
2637}
2638
2639# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2640
2641
2642/*
2643 * DEC
2644 */
2645
2646IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2647{
2648 uint64_t uDst = *puDst;
2649 uint64_t uResult = uDst - 1;
2650 *puDst = uResult;
2651 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 64, 1 /*INC*/);
2652}
2653
2654# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2655
2656IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2657{
2658 uint32_t uDst = *puDst;
2659 uint32_t uResult = uDst - 1;
2660 *puDst = uResult;
2661 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 32, 1 /*INC*/);
2662}
2663
2664
2665IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2666{
2667 uint16_t uDst = *puDst;
2668 uint16_t uResult = uDst - 1;
2669 *puDst = uResult;
2670 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 16, 1 /*INC*/);
2671}
2672
2673
2674IEM_DECL_IMPL_DEF(void, iemAImpl_dec_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2675{
2676 uint8_t uDst = *puDst;
2677 uint8_t uResult = uDst - 1;
2678 *puDst = uResult;
2679 IEM_EFL_UPDATE_STATUS_BITS_FOR_INC_DEC(pfEFlags, uResult, uDst, 8, 1 /*INC*/);
2680}
2681
2682# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2683
2684
2685/*
2686 * NOT
2687 */
2688
2689IEM_DECL_IMPL_DEF(void, iemAImpl_not_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2690{
2691 uint64_t uDst = *puDst;
2692 uint64_t uResult = ~uDst;
2693 *puDst = uResult;
2694 /* EFLAGS are not modified. */
2695 RT_NOREF_PV(pfEFlags);
2696}
2697
2698# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2699
2700IEM_DECL_IMPL_DEF(void, iemAImpl_not_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2701{
2702 uint32_t uDst = *puDst;
2703 uint32_t uResult = ~uDst;
2704 *puDst = uResult;
2705 /* EFLAGS are not modified. */
2706 RT_NOREF_PV(pfEFlags);
2707}
2708
2709IEM_DECL_IMPL_DEF(void, iemAImpl_not_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2710{
2711 uint16_t uDst = *puDst;
2712 uint16_t uResult = ~uDst;
2713 *puDst = uResult;
2714 /* EFLAGS are not modified. */
2715 RT_NOREF_PV(pfEFlags);
2716}
2717
2718IEM_DECL_IMPL_DEF(void, iemAImpl_not_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2719{
2720 uint8_t uDst = *puDst;
2721 uint8_t uResult = ~uDst;
2722 *puDst = uResult;
2723 /* EFLAGS are not modified. */
2724 RT_NOREF_PV(pfEFlags);
2725}
2726
2727# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2728
2729
2730/*
2731 * NEG
2732 */
2733
/**
 * Updates the status bits (CF, PF, AF, ZF, SF, and OF) for a NEG instruction.
 *
 * Statement macro: it yields no value and writes the updated flags back
 * through @a a_pfEFlags.
 *
 * @param   a_pfEFlags      Pointer to the 32-bit EFLAGS value to update.
 * @param   a_uResult       Unsigned result value.
 * @param   a_uDst          The original destination value (for the CF, AF and OF calc).
 * @param   a_cBitsWidth    The width of the result (8, 16, 32, 64).
 */
#define IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(a_pfEFlags, a_uResult, a_uDst, a_cBitsWidth) \
    do { \
        uint32_t fEflNew = *(a_pfEFlags) & (~X86_EFL_STATUS_BITS & ~X86_EFL_CF); \
        /* Flags derived from the result alone. */ \
        fEflNew |= X86_EFL_CALC_ZF(a_uResult); \
        fEflNew |= X86_EFL_CALC_SF(a_uResult, a_cBitsWidth); \
        fEflNew |= g_afParity[(a_uResult) & 0xff]; \
        /* Flags that also look at the original operand: \
              CF is set for any non-zero operand, \
              AF when result and operand differ in the X86_EFL_AF bit position, \
              OF from the bits that are set in both operand and result. */ \
        fEflNew |= ((a_uDst) != 0) << X86_EFL_CF_BIT; \
        fEflNew |= ((uint32_t)(a_uResult) ^ (uint32_t)(a_uDst)) & X86_EFL_AF; \
        fEflNew |= X86_EFL_GET_OF_ ## a_cBitsWidth((a_uDst) & (a_uResult)); \
        *(a_pfEFlags) = fEflNew; \
    } while (0)
2755
2756IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u64,(uint64_t *puDst, uint32_t *pfEFlags))
2757{
2758 uint64_t uDst = *puDst;
2759 uint64_t uResult = (uint64_t)0 - uDst;
2760 *puDst = uResult;
2761 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 64);
2762}
2763
2764# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2765
2766IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u32,(uint32_t *puDst, uint32_t *pfEFlags))
2767{
2768 uint32_t uDst = *puDst;
2769 uint32_t uResult = (uint32_t)0 - uDst;
2770 *puDst = uResult;
2771 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 32);
2772}
2773
2774
2775IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u16,(uint16_t *puDst, uint32_t *pfEFlags))
2776{
2777 uint16_t uDst = *puDst;
2778 uint16_t uResult = (uint16_t)0 - uDst;
2779 *puDst = uResult;
2780 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 16);
2781}
2782
2783
2784IEM_DECL_IMPL_DEF(void, iemAImpl_neg_u8,(uint8_t *puDst, uint32_t *pfEFlags))
2785{
2786 uint8_t uDst = *puDst;
2787 uint8_t uResult = (uint8_t)0 - uDst;
2788 *puDst = uResult;
2789 IEM_EFL_UPDATE_STATUS_BITS_FOR_NEG(pfEFlags, uResult, uDst, 8);
2790}
2791
2792# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
2793
2794/*
2795 * Locked variants.
2796 */
2797
2798/** Emit a function for doing a locked unary operand operation. */
2799# define EMIT_LOCKED_UNARY_OP(a_Mnemonic, a_cBitsWidth) \
2800 IEM_DECL_IMPL_DEF(void, iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth ## _locked,(uint ## a_cBitsWidth ## _t *puDst, \
2801 uint32_t *pfEFlags)) \
2802 { \
2803 uint ## a_cBitsWidth ## _t uOld = ASMAtomicUoReadU ## a_cBitsWidth(puDst); \
2804 uint ## a_cBitsWidth ## _t uTmp; \
2805 uint32_t fEflTmp; \
2806 do \
2807 { \
2808 uTmp = uOld; \
2809 fEflTmp = *pfEFlags; \
2810 iemAImpl_ ## a_Mnemonic ## _u ## a_cBitsWidth(&uTmp, &fEflTmp); \
2811 } while (!ASMAtomicCmpXchgExU ## a_cBitsWidth(puDst, uTmp, uOld, &uOld)); \
2812 *pfEFlags = fEflTmp; \
2813 }
2814
2815EMIT_LOCKED_UNARY_OP(inc, 64)
2816EMIT_LOCKED_UNARY_OP(dec, 64)
2817EMIT_LOCKED_UNARY_OP(not, 64)
2818EMIT_LOCKED_UNARY_OP(neg, 64)
2819# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
2820EMIT_LOCKED_UNARY_OP(inc, 32)
2821EMIT_LOCKED_UNARY_OP(dec, 32)
2822EMIT_LOCKED_UNARY_OP(not, 32)
2823EMIT_LOCKED_UNARY_OP(neg, 32)
2824
2825EMIT_LOCKED_UNARY_OP(inc, 16)
2826EMIT_LOCKED_UNARY_OP(dec, 16)
2827EMIT_LOCKED_UNARY_OP(not, 16)
2828EMIT_LOCKED_UNARY_OP(neg, 16)
2829
2830EMIT_LOCKED_UNARY_OP(inc, 8)
2831EMIT_LOCKED_UNARY_OP(dec, 8)
2832EMIT_LOCKED_UNARY_OP(not, 8)
2833EMIT_LOCKED_UNARY_OP(neg, 8)
2834# endif
2835
2836#endif /* !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY) */
2837
2838
2839/*********************************************************************************************************************************
2840* Shifting and Rotating *
2841*********************************************************************************************************************************/
2842
2843/*
2844 * ROL
2845 */
2846#define EMIT_ROL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2847IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rol_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2848{ \
2849 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2850 if (cShift) \
2851 { \
2852 if (a_cBitsWidth < 32) \
2853 cShift &= a_cBitsWidth - 1; \
2854 a_uType const uDst = *puDst; \
2855 a_uType const uResult = a_fnHlp(uDst, cShift); \
2856 *puDst = uResult; \
2857 \
2858 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
2859 it the same way as for 1 bit shifts. */ \
2860 AssertCompile(X86_EFL_CF_BIT == 0); \
2861 uint32_t fEfl = *pfEFlags; \
2862 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2863 uint32_t const fCarry = (uResult & X86_EFL_CF); \
2864 fEfl |= fCarry; \
2865 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2866 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; \
2867 else /* Intel 10980XE: According to the first sub-shift: */ \
2868 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
2869 *pfEFlags = fEfl; \
2870 } \
2871}
2872
2873#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2874EMIT_ROL(64, uint64_t, RT_NOTHING, 1, ASMRotateLeftU64)
2875#endif
2876EMIT_ROL(64, uint64_t, _intel, 1, ASMRotateLeftU64)
2877EMIT_ROL(64, uint64_t, _amd, 0, ASMRotateLeftU64)
2878
2879#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2880EMIT_ROL(32, uint32_t, RT_NOTHING, 1, ASMRotateLeftU32)
2881#endif
2882EMIT_ROL(32, uint32_t, _intel, 1, ASMRotateLeftU32)
2883EMIT_ROL(32, uint32_t, _amd, 0, ASMRotateLeftU32)
2884
2885DECL_FORCE_INLINE(uint16_t) iemAImpl_rol_u16_hlp(uint16_t uValue, uint8_t cShift)
2886{
2887 return (uValue << cShift) | (uValue >> (16 - cShift));
2888}
2889#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2890EMIT_ROL(16, uint16_t, RT_NOTHING, 1, iemAImpl_rol_u16_hlp)
2891#endif
2892EMIT_ROL(16, uint16_t, _intel, 1, iemAImpl_rol_u16_hlp)
2893EMIT_ROL(16, uint16_t, _amd, 0, iemAImpl_rol_u16_hlp)
2894
2895DECL_FORCE_INLINE(uint8_t) iemAImpl_rol_u8_hlp(uint8_t uValue, uint8_t cShift)
2896{
2897 return (uValue << cShift) | (uValue >> (8 - cShift));
2898}
2899#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2900EMIT_ROL(8, uint8_t, RT_NOTHING, 1, iemAImpl_rol_u8_hlp)
2901#endif
2902EMIT_ROL(8, uint8_t, _intel, 1, iemAImpl_rol_u8_hlp)
2903EMIT_ROL(8, uint8_t, _amd, 0, iemAImpl_rol_u8_hlp)
2904
2905
2906/*
2907 * ROR
2908 */
2909#define EMIT_ROR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags, a_fnHlp) \
2910IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_ror_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2911{ \
2912 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2913 if (cShift) \
2914 { \
2915 if (a_cBitsWidth < 32) \
2916 cShift &= a_cBitsWidth - 1; \
2917 a_uType const uDst = *puDst; \
2918 a_uType const uResult = a_fnHlp(uDst, cShift); \
2919 *puDst = uResult; \
2920 \
2921 /* Calc EFLAGS: */ \
2922 AssertCompile(X86_EFL_CF_BIT == 0); \
2923 uint32_t fEfl = *pfEFlags; \
2924 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2925 uint32_t const fCarry = (uResult >> ((a_cBitsWidth) - 1)) & X86_EFL_CF; \
2926 fEfl |= fCarry; \
2927 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2928 fEfl |= (((uResult >> ((a_cBitsWidth) - 2)) ^ fCarry) & 1) << X86_EFL_OF_BIT; \
2929 else /* Intel 10980XE: According to the first sub-shift: */ \
2930 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << (a_cBitsWidth - 1))); \
2931 *pfEFlags = fEfl; \
2932 } \
2933}
2934
2935#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
2936EMIT_ROR(64, uint64_t, RT_NOTHING, 1, ASMRotateRightU64)
2937#endif
2938EMIT_ROR(64, uint64_t, _intel, 1, ASMRotateRightU64)
2939EMIT_ROR(64, uint64_t, _amd, 0, ASMRotateRightU64)
2940
2941#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2942EMIT_ROR(32, uint32_t, RT_NOTHING, 1, ASMRotateRightU32)
2943#endif
2944EMIT_ROR(32, uint32_t, _intel, 1, ASMRotateRightU32)
2945EMIT_ROR(32, uint32_t, _amd, 0, ASMRotateRightU32)
2946
2947DECL_FORCE_INLINE(uint16_t) iemAImpl_ror_u16_hlp(uint16_t uValue, uint8_t cShift)
2948{
2949 return (uValue >> cShift) | (uValue << (16 - cShift));
2950}
2951#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2952EMIT_ROR(16, uint16_t, RT_NOTHING, 1, iemAImpl_ror_u16_hlp)
2953#endif
2954EMIT_ROR(16, uint16_t, _intel, 1, iemAImpl_ror_u16_hlp)
2955EMIT_ROR(16, uint16_t, _amd, 0, iemAImpl_ror_u16_hlp)
2956
2957DECL_FORCE_INLINE(uint8_t) iemAImpl_ror_u8_hlp(uint8_t uValue, uint8_t cShift)
2958{
2959 return (uValue >> cShift) | (uValue << (8 - cShift));
2960}
2961#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
2962EMIT_ROR(8, uint8_t, RT_NOTHING, 1, iemAImpl_ror_u8_hlp)
2963#endif
2964EMIT_ROR(8, uint8_t, _intel, 1, iemAImpl_ror_u8_hlp)
2965EMIT_ROR(8, uint8_t, _amd, 0, iemAImpl_ror_u8_hlp)
2966
2967
2968/*
2969 * RCL
2970 */
2971#define EMIT_RCL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
2972IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
2973{ \
2974 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
2975 if (a_cBitsWidth < 32 && a_fIntelFlags) \
2976 cShift %= a_cBitsWidth + 1; \
2977 if (cShift) \
2978 { \
2979 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
2980 cShift %= a_cBitsWidth + 1; \
2981 a_uType const uDst = *puDst; \
2982 a_uType uResult = uDst << cShift; \
2983 if (cShift > 1) \
2984 uResult |= uDst >> (a_cBitsWidth + 1 - cShift); \
2985 \
2986 AssertCompile(X86_EFL_CF_BIT == 0); \
2987 uint32_t fEfl = *pfEFlags; \
2988 uint32_t fInCarry = fEfl & X86_EFL_CF; \
2989 uResult |= (a_uType)fInCarry << (cShift - 1); \
2990 \
2991 *puDst = uResult; \
2992 \
2993 /* Calc EFLAGS. */ \
2994 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
2995 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
2996 ? (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF : fInCarry; \
2997 fEfl |= fOutCarry; \
2998 if (!a_fIntelFlags) /* AMD 3990X: According to the last sub-shift: */ \
2999 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fOutCarry) << X86_EFL_OF_BIT; \
3000 else /* Intel 10980XE: According to the first sub-shift: */ \
3001 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3002 *pfEFlags = fEfl; \
3003 } \
3004}
3005
3006#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3007EMIT_RCL(64, uint64_t, RT_NOTHING, 1)
3008#endif
3009EMIT_RCL(64, uint64_t, _intel, 1)
3010EMIT_RCL(64, uint64_t, _amd, 0)
3011
3012#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3013EMIT_RCL(32, uint32_t, RT_NOTHING, 1)
3014#endif
3015EMIT_RCL(32, uint32_t, _intel, 1)
3016EMIT_RCL(32, uint32_t, _amd, 0)
3017
3018#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3019EMIT_RCL(16, uint16_t, RT_NOTHING, 1)
3020#endif
3021EMIT_RCL(16, uint16_t, _intel, 1)
3022EMIT_RCL(16, uint16_t, _amd, 0)
3023
3024#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3025EMIT_RCL(8, uint8_t, RT_NOTHING, 1)
3026#endif
3027EMIT_RCL(8, uint8_t, _intel, 1)
3028EMIT_RCL(8, uint8_t, _amd, 0)
3029
3030
3031/*
3032 * RCR
3033 */
3034#define EMIT_RCR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3035IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_rcr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3036{ \
3037 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3038 if (a_cBitsWidth < 32 && a_fIntelFlags) \
3039 cShift %= a_cBitsWidth + 1; \
3040 if (cShift) \
3041 { \
3042 if (a_cBitsWidth < 32 && !a_fIntelFlags) \
3043 cShift %= a_cBitsWidth + 1; \
3044 a_uType const uDst = *puDst; \
3045 a_uType uResult = uDst >> cShift; \
3046 if (cShift > 1) \
3047 uResult |= uDst << (a_cBitsWidth + 1 - cShift); \
3048 \
3049 AssertCompile(X86_EFL_CF_BIT == 0); \
3050 uint32_t fEfl = *pfEFlags; \
3051 uint32_t fInCarry = fEfl & X86_EFL_CF; \
3052 uResult |= (a_uType)fInCarry << (a_cBitsWidth - cShift); \
3053 *puDst = uResult; \
3054 \
3055 /* Calc EFLAGS. The OF bit is undefined if cShift > 1, we implement \
3056 it the same way as for 1 bit shifts. */ \
3057 fEfl &= ~(X86_EFL_CF | X86_EFL_OF); \
3058 uint32_t const fOutCarry = a_cBitsWidth >= 32 || a_fIntelFlags || cShift \
3059 ? (uDst >> (cShift - 1)) & X86_EFL_CF : fInCarry; \
3060 fEfl |= fOutCarry; \
3061 if (!a_fIntelFlags) /* AMD 3990X: XOR two most signficant bits of the result: */ \
3062 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uResult ^ (uResult << 1)); \
3063 else /* Intel 10980XE: same as AMD, but only for the first sub-shift: */ \
3064 fEfl |= (fInCarry ^ (uint32_t)(uDst >> (a_cBitsWidth - 1))) << X86_EFL_OF_BIT; \
3065 *pfEFlags = fEfl; \
3066 } \
3067}
3068
3069#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3070EMIT_RCR(64, uint64_t, RT_NOTHING, 1)
3071#endif
3072EMIT_RCR(64, uint64_t, _intel, 1)
3073EMIT_RCR(64, uint64_t, _amd, 0)
3074
3075#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3076EMIT_RCR(32, uint32_t, RT_NOTHING, 1)
3077#endif
3078EMIT_RCR(32, uint32_t, _intel, 1)
3079EMIT_RCR(32, uint32_t, _amd, 0)
3080
3081#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3082EMIT_RCR(16, uint16_t, RT_NOTHING, 1)
3083#endif
3084EMIT_RCR(16, uint16_t, _intel, 1)
3085EMIT_RCR(16, uint16_t, _amd, 0)
3086
3087#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3088EMIT_RCR(8, uint8_t, RT_NOTHING, 1)
3089#endif
3090EMIT_RCR(8, uint8_t, _intel, 1)
3091EMIT_RCR(8, uint8_t, _amd, 0)
3092
3093
3094/*
3095 * SHL
3096 */
3097#define EMIT_SHL(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3098IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shl_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3099{ \
3100 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3101 if (cShift) \
3102 { \
3103 a_uType const uDst = *puDst; \
3104 a_uType uResult = uDst << cShift; \
3105 *puDst = uResult; \
3106 \
3107 /* Calc EFLAGS. */ \
3108 AssertCompile(X86_EFL_CF_BIT == 0); \
3109 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3110 uint32_t fCarry = (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; \
3111 fEfl |= fCarry; \
3112 if (!a_fIntelFlags) \
3113 fEfl |= ((uResult >> (a_cBitsWidth - 1)) ^ fCarry) << X86_EFL_OF_BIT; /* AMD 3990X: Last shift result. */ \
3114 else \
3115 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); /* Intel 10980XE: First shift result. */ \
3116 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3117 fEfl |= X86_EFL_CALC_ZF(uResult); \
3118 fEfl |= g_afParity[uResult & 0xff]; \
3119 if (!a_fIntelFlags) \
3120 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3121 *pfEFlags = fEfl; \
3122 } \
3123}
3124
3125#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3126EMIT_SHL(64, uint64_t, RT_NOTHING, 1)
3127#endif
3128EMIT_SHL(64, uint64_t, _intel, 1)
3129EMIT_SHL(64, uint64_t, _amd, 0)
3130
3131#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3132EMIT_SHL(32, uint32_t, RT_NOTHING, 1)
3133#endif
3134EMIT_SHL(32, uint32_t, _intel, 1)
3135EMIT_SHL(32, uint32_t, _amd, 0)
3136
3137#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3138EMIT_SHL(16, uint16_t, RT_NOTHING, 1)
3139#endif
3140EMIT_SHL(16, uint16_t, _intel, 1)
3141EMIT_SHL(16, uint16_t, _amd, 0)
3142
3143#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3144EMIT_SHL(8, uint8_t, RT_NOTHING, 1)
3145#endif
3146EMIT_SHL(8, uint8_t, _intel, 1)
3147EMIT_SHL(8, uint8_t, _amd, 0)
3148
3149
3150/*
3151 * SHR
3152 */
3153#define EMIT_SHR(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3154IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shr_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3155{ \
3156 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3157 if (cShift) \
3158 { \
3159 a_uType const uDst = *puDst; \
3160 a_uType uResult = uDst >> cShift; \
3161 *puDst = uResult; \
3162 \
3163 /* Calc EFLAGS. */ \
3164 AssertCompile(X86_EFL_CF_BIT == 0); \
3165 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3166 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3167 if (a_fIntelFlags || cShift == 1) /* AMD 3990x does what intel documents; Intel 10980XE does this for all shift counts. */ \
3168 fEfl |= (uDst >> (a_cBitsWidth - 1)) << X86_EFL_OF_BIT; \
3169 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3170 fEfl |= X86_EFL_CALC_ZF(uResult); \
3171 fEfl |= g_afParity[uResult & 0xff]; \
3172 if (!a_fIntelFlags) \
3173 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3174 *pfEFlags = fEfl; \
3175 } \
3176}
3177
3178#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3179EMIT_SHR(64, uint64_t, RT_NOTHING, 1)
3180#endif
3181EMIT_SHR(64, uint64_t, _intel, 1)
3182EMIT_SHR(64, uint64_t, _amd, 0)
3183
3184#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3185EMIT_SHR(32, uint32_t, RT_NOTHING, 1)
3186#endif
3187EMIT_SHR(32, uint32_t, _intel, 1)
3188EMIT_SHR(32, uint32_t, _amd, 0)
3189
3190#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3191EMIT_SHR(16, uint16_t, RT_NOTHING, 1)
3192#endif
3193EMIT_SHR(16, uint16_t, _intel, 1)
3194EMIT_SHR(16, uint16_t, _amd, 0)
3195
3196#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3197EMIT_SHR(8, uint8_t, RT_NOTHING, 1)
3198#endif
3199EMIT_SHR(8, uint8_t, _intel, 1)
3200EMIT_SHR(8, uint8_t, _amd, 0)
3201
3202
3203/*
3204 * SAR
3205 */
3206#define EMIT_SAR(a_cBitsWidth, a_uType, a_iType, a_Suffix, a_fIntelFlags) \
3207IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sar_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, uint8_t cShift, uint32_t *pfEFlags)) \
3208{ \
3209 cShift &= a_cBitsWidth >= 32 ? a_cBitsWidth - 1 : 31; \
3210 if (cShift) \
3211 { \
3212 a_iType const iDst = (a_iType)*puDst; \
3213 a_uType uResult = iDst >> cShift; \
3214 *puDst = uResult; \
3215 \
3216 /* Calc EFLAGS. \
3217 Note! The OF flag is always zero because the result never differs from the input. */ \
3218 AssertCompile(X86_EFL_CF_BIT == 0); \
3219 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3220 fEfl |= (iDst >> (cShift - 1)) & X86_EFL_CF; \
3221 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3222 fEfl |= X86_EFL_CALC_ZF(uResult); \
3223 fEfl |= g_afParity[uResult & 0xff]; \
3224 if (!a_fIntelFlags) \
3225 fEfl |= X86_EFL_AF; /* AMD 3990x sets it unconditionally, Intel 10980XE does the oposite */ \
3226 *pfEFlags = fEfl; \
3227 } \
3228}
3229
3230#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3231EMIT_SAR(64, uint64_t, int64_t, RT_NOTHING, 1)
3232#endif
3233EMIT_SAR(64, uint64_t, int64_t, _intel, 1)
3234EMIT_SAR(64, uint64_t, int64_t, _amd, 0)
3235
3236#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3237EMIT_SAR(32, uint32_t, int32_t, RT_NOTHING, 1)
3238#endif
3239EMIT_SAR(32, uint32_t, int32_t, _intel, 1)
3240EMIT_SAR(32, uint32_t, int32_t, _amd, 0)
3241
3242#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3243EMIT_SAR(16, uint16_t, int16_t, RT_NOTHING, 1)
3244#endif
3245EMIT_SAR(16, uint16_t, int16_t, _intel, 1)
3246EMIT_SAR(16, uint16_t, int16_t, _amd, 0)
3247
3248#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3249EMIT_SAR(8, uint8_t, int8_t, RT_NOTHING, 1)
3250#endif
3251EMIT_SAR(8, uint8_t, int8_t, _intel, 1)
3252EMIT_SAR(8, uint8_t, int8_t, _amd, 0)
3253
3254
3255/*
3256 * SHLD
3257 *
3258 * - CF is the last bit shifted out of puDst.
3259 * - AF is always cleared by Intel 10980XE.
3260 * - AF is always set by AMD 3990X.
3261 * - OF is set according to the first shift on Intel 10980XE, it seems.
3262 * - OF is set according to the last sub-shift on AMD 3990X.
3263 * - ZF, SF and PF are calculated according to the result by both vendors.
3264 *
3265 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3266 * pick either the source register or the destination register for input bits
3267 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * intel has changed behaviour here several times. We implement what current
 * Skylake-based CPUs do for now; we can extend this later as needed.
3270 */
3271#define EMIT_SHLD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3272IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shld_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, \
3273 uint32_t *pfEFlags)) \
3274{ \
3275 cShift &= a_cBitsWidth - 1; \
3276 if (cShift) \
3277 { \
3278 a_uType const uDst = *puDst; \
3279 a_uType uResult = uDst << cShift; \
3280 uResult |= uSrc >> (a_cBitsWidth - cShift); \
3281 *puDst = uResult; \
3282 \
3283 /* CALC EFLAGS: */ \
3284 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3285 if (a_fIntelFlags) \
3286 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3287 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uDst << 1)); \
3288 else \
3289 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3290 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uDst << (cShift - 1)) ^ uResult); \
3291 fEfl |= X86_EFL_AF; \
3292 } \
3293 AssertCompile(X86_EFL_CF_BIT == 0); \
3294 fEfl |= (uDst >> (a_cBitsWidth - cShift)) & X86_EFL_CF; /* CF = last bit shifted out */ \
3295 fEfl |= g_afParity[uResult & 0xff]; \
3296 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3297 fEfl |= X86_EFL_CALC_ZF(uResult); \
3298 *pfEFlags = fEfl; \
3299 } \
3300}
3301
3302#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3303EMIT_SHLD(64, uint64_t, RT_NOTHING, 1)
3304#endif
3305EMIT_SHLD(64, uint64_t, _intel, 1)
3306EMIT_SHLD(64, uint64_t, _amd, 0)
3307
3308#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3309EMIT_SHLD(32, uint32_t, RT_NOTHING, 1)
3310#endif
3311EMIT_SHLD(32, uint32_t, _intel, 1)
3312EMIT_SHLD(32, uint32_t, _amd, 0)
3313
3314#define EMIT_SHLD_16(a_Suffix, a_fIntelFlags) \
3315IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shld_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3316{ \
3317 cShift &= 31; \
3318 if (cShift) \
3319 { \
3320 uint16_t const uDst = *puDst; \
3321 uint64_t const uTmp = a_fIntelFlags \
3322 ? ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uDst \
3323 : ((uint64_t)uDst << 32) | ((uint32_t)uSrc << 16) | uSrc; \
3324 uint16_t const uResult = (uint16_t)((uTmp << cShift) >> 32); \
3325 *puDst = uResult; \
3326 \
3327 /* CALC EFLAGS: */ \
3328 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3329 AssertCompile(X86_EFL_CF_BIT == 0); \
3330 if (a_fIntelFlags) \
3331 { \
3332 fEfl |= (uTmp >> (48 - cShift)) & X86_EFL_CF; /* CF = last bit shifted out of the combined operand */ \
3333 /* Intel 6700K & 10980XE: OF is et according to the first shift. AF always cleared. */ \
3334 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uDst << 1)); \
3335 } \
3336 else \
3337 { \
3338 /* AMD 3990X: OF is set according to last shift, with some weirdness. AF always set. CF = last bit shifted out of uDst. */ \
3339 if (cShift < 16) \
3340 { \
3341 fEfl |= (uDst >> (16 - cShift)) & X86_EFL_CF; \
3342 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ uResult); \
3343 } \
3344 else \
3345 { \
3346 if (cShift == 16) \
3347 fEfl |= uDst & X86_EFL_CF; \
3348 fEfl |= X86_EFL_GET_OF_16((uDst << (cShift - 1)) ^ 0); \
3349 } \
3350 fEfl |= X86_EFL_AF; \
3351 } \
3352 fEfl |= g_afParity[uResult & 0xff]; \
3353 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3354 fEfl |= X86_EFL_CALC_ZF(uResult); \
3355 *pfEFlags = fEfl; \
3356 } \
3357}
3358
3359#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3360EMIT_SHLD_16(RT_NOTHING, 1)
3361#endif
3362EMIT_SHLD_16(_intel, 1)
3363EMIT_SHLD_16(_amd, 0)
3364
3365
3366/*
3367 * SHRD
3368 *
3369 * EFLAGS behaviour seems to be the same as with SHLD:
3370 * - CF is the last bit shifted out of puDst.
3371 * - AF is always cleared by Intel 10980XE.
3372 * - AF is always set by AMD 3990X.
3373 * - OF is set according to the first shift on Intel 10980XE, it seems.
3374 * - OF is set according to the last sub-shift on AMD 3990X.
3375 * - ZF, SF and PF are calculated according to the result by both vendors.
3376 *
3377 * For 16-bit shifts the count mask isn't 15, but 31, and the CPU will
3378 * pick either the source register or the destination register for input bits
3379 * when going beyond 16. According to https://www.sandpile.org/x86/flags.htm
 * intel has changed behaviour here several times. We implement what current
 * Skylake-based CPUs do for now; we can extend this later as needed.
3382 */
3383#define EMIT_SHRD(a_cBitsWidth, a_uType, a_Suffix, a_fIntelFlags) \
3384IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrd_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3385{ \
3386 cShift &= a_cBitsWidth - 1; \
3387 if (cShift) \
3388 { \
3389 a_uType const uDst = *puDst; \
3390 a_uType uResult = uDst >> cShift; \
3391 uResult |= uSrc << (a_cBitsWidth - cShift); \
3392 *puDst = uResult; \
3393 \
3394 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3395 AssertCompile(X86_EFL_CF_BIT == 0); \
3396 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3397 if (a_fIntelFlags) \
3398 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3399 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ (uSrc << (a_cBitsWidth - 1))); \
3400 else \
3401 { /* AMD 3990X: Set according to last shift. AF always set. */ \
3402 if (cShift > 1) /* Set according to last shift. */ \
3403 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth((uSrc << (a_cBitsWidth - cShift + 1)) ^ uResult); \
3404 else \
3405 fEfl |= X86_EFL_GET_OF_ ## a_cBitsWidth(uDst ^ uResult); \
3406 fEfl |= X86_EFL_AF; \
3407 } \
3408 fEfl |= X86_EFL_CALC_SF(uResult, a_cBitsWidth); \
3409 fEfl |= X86_EFL_CALC_ZF(uResult); \
3410 fEfl |= g_afParity[uResult & 0xff]; \
3411 *pfEFlags = fEfl; \
3412 } \
3413}
3414
3415#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3416EMIT_SHRD(64, uint64_t, RT_NOTHING, 1)
3417#endif
3418EMIT_SHRD(64, uint64_t, _intel, 1)
3419EMIT_SHRD(64, uint64_t, _amd, 0)
3420
3421#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3422EMIT_SHRD(32, uint32_t, RT_NOTHING, 1)
3423#endif
3424EMIT_SHRD(32, uint32_t, _intel, 1)
3425EMIT_SHRD(32, uint32_t, _amd, 0)
3426
3427#define EMIT_SHRD_16(a_Suffix, a_fIntelFlags) \
3428IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_shrd_u16,a_Suffix),(uint16_t *puDst, uint16_t uSrc, uint8_t cShift, uint32_t *pfEFlags)) \
3429{ \
3430 cShift &= 31; \
3431 if (cShift) \
3432 { \
3433 uint16_t const uDst = *puDst; \
3434 uint64_t const uTmp = a_fIntelFlags \
3435 ? uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uDst << 32) \
3436 : uDst | ((uint32_t)uSrc << 16) | ((uint64_t)uSrc << 32); \
3437 uint16_t const uResult = (uint16_t)(uTmp >> cShift); \
3438 *puDst = uResult; \
3439 \
3440 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS; \
3441 AssertCompile(X86_EFL_CF_BIT == 0); \
3442 if (a_fIntelFlags) \
3443 { \
3444 /* Intel 10980XE: The CF is the last shifted out of the combined uTmp operand. */ \
3445 fEfl |= (uTmp >> (cShift - 1)) & X86_EFL_CF; \
3446 /* Intel 6700K & 10980XE: Set according to the first shift. AF always cleared. */ \
3447 fEfl |= X86_EFL_GET_OF_16(uDst ^ (uSrc << 15)); \
3448 } \
3449 else \
3450 { \
3451 /* AMD 3990X: CF flag seems to be last bit shifted out of uDst, not the combined uSrc:uSrc:uDst operand. */ \
3452 fEfl |= (uDst >> (cShift - 1)) & X86_EFL_CF; \
3453 /* AMD 3990X: Set according to last shift. AF always set. */ \
3454 if (cShift > 1) /* Set according to last shift. */ \
3455 fEfl |= X86_EFL_GET_OF_16((uint16_t)(uTmp >> (cShift - 1)) ^ uResult); \
3456 else \
3457 fEfl |= X86_EFL_GET_OF_16(uDst ^ uResult); \
3458 fEfl |= X86_EFL_AF; \
3459 } \
3460 fEfl |= X86_EFL_CALC_SF(uResult, 16); \
3461 fEfl |= X86_EFL_CALC_ZF(uResult); \
3462 fEfl |= g_afParity[uResult & 0xff]; \
3463 *pfEFlags = fEfl; \
3464 } \
3465}
3466
3467#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3468EMIT_SHRD_16(RT_NOTHING, 1)
3469#endif
3470EMIT_SHRD_16(_intel, 1)
3471EMIT_SHRD_16(_amd, 0)
3472
3473
3474/*
3475 * RORX (BMI2)
3476 */
3477#define EMIT_RORX(a_cBitsWidth, a_uType, a_fnHlp) \
3478IEM_DECL_IMPL_DEF(void, RT_CONCAT(iemAImpl_rorx_u,a_cBitsWidth),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3479{ \
3480 *puDst = a_fnHlp(uSrc, cShift & (a_cBitsWidth - 1)); \
3481}
3482
3483#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3484EMIT_RORX(64, uint64_t, ASMRotateRightU64)
3485#endif
3486#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3487EMIT_RORX(32, uint32_t, ASMRotateRightU32)
3488#endif
3489
3490
3491/*
3492 * SHLX (BMI2)
3493 */
3494#define EMIT_SHLX(a_cBitsWidth, a_uType, a_Suffix) \
3495IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shlx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3496{ \
3497 cShift &= a_cBitsWidth - 1; \
3498 *puDst = uSrc << cShift; \
3499}
3500
3501#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3502EMIT_SHLX(64, uint64_t, RT_NOTHING)
3503EMIT_SHLX(64, uint64_t, _fallback)
3504#endif
3505#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3506EMIT_SHLX(32, uint32_t, RT_NOTHING)
3507EMIT_SHLX(32, uint32_t, _fallback)
3508#endif
3509
3510
3511/*
3512 * SHRX (BMI2)
3513 */
3514#define EMIT_SHRX(a_cBitsWidth, a_uType, a_Suffix) \
3515IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_shrx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3516{ \
3517 cShift &= a_cBitsWidth - 1; \
3518 *puDst = uSrc >> cShift; \
3519}
3520
3521#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3522EMIT_SHRX(64, uint64_t, RT_NOTHING)
3523EMIT_SHRX(64, uint64_t, _fallback)
3524#endif
3525#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3526EMIT_SHRX(32, uint32_t, RT_NOTHING)
3527EMIT_SHRX(32, uint32_t, _fallback)
3528#endif
3529
3530
3531/*
3532 * SARX (BMI2)
3533 */
3534#define EMIT_SARX(a_cBitsWidth, a_uType, a_iType, a_Suffix) \
3535IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_sarx_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType cShift)) \
3536{ \
3537 cShift &= a_cBitsWidth - 1; \
3538 *puDst = (a_iType)uSrc >> cShift; \
3539}
3540
3541#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3542EMIT_SARX(64, uint64_t, int64_t, RT_NOTHING)
3543EMIT_SARX(64, uint64_t, int64_t, _fallback)
3544#endif
3545#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3546EMIT_SARX(32, uint32_t, int32_t, RT_NOTHING)
3547EMIT_SARX(32, uint32_t, int32_t, _fallback)
3548#endif
3549
3550
3551/*
3552 * PDEP (BMI2)
3553 */
3554#define EMIT_PDEP(a_cBitsWidth, a_uType, a_Suffix) \
3555IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pdep_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3556{ \
3557 a_uType uResult = 0; \
3558 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3559 if (fMask & ((a_uType)1 << iMaskBit)) \
3560 { \
3561 uResult |= ((uSrc >> iBit) & 1) << iMaskBit; \
3562 iBit++; \
3563 } \
3564 *puDst = uResult; \
3565}
3566
3567#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3568EMIT_PDEP(64, uint64_t, RT_NOTHING)
3569#endif
3570EMIT_PDEP(64, uint64_t, _fallback)
3571#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3572EMIT_PDEP(32, uint32_t, RT_NOTHING)
3573#endif
3574EMIT_PDEP(32, uint32_t, _fallback)
3575
3576/*
3577 * PEXT (BMI2)
3578 */
3579#define EMIT_PEXT(a_cBitsWidth, a_uType, a_Suffix) \
3580IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_pext_u,a_cBitsWidth,a_Suffix),(a_uType *puDst, a_uType uSrc, a_uType fMask)) \
3581{ \
3582 a_uType uResult = 0; \
3583 for (unsigned iMaskBit = 0, iBit = 0; iMaskBit < a_cBitsWidth; iMaskBit++) \
3584 if (fMask & ((a_uType)1 << iMaskBit)) \
3585 { \
3586 uResult |= ((uSrc >> iMaskBit) & 1) << iBit; \
3587 iBit++; \
3588 } \
3589 *puDst = uResult; \
3590}
3591
3592#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3593EMIT_PEXT(64, uint64_t, RT_NOTHING)
3594#endif
3595EMIT_PEXT(64, uint64_t, _fallback)
3596#if (!defined(RT_ARCH_X86) && !defined(RT_ARCH_AMD64)) || defined(IEM_WITHOUT_ASSEMBLY)
3597EMIT_PEXT(32, uint32_t, RT_NOTHING)
3598#endif
3599EMIT_PEXT(32, uint32_t, _fallback)
3600
3601
3602#if !defined(RT_ARCH_AMD64) || defined(IEM_WITHOUT_ASSEMBLY)
3603
3604# if !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY)
3605/*
3606 * BSWAP
3607 */
3608
3609IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u64,(uint64_t *puDst))
3610{
3611 *puDst = ASMByteSwapU64(*puDst);
3612}
3613
3614
3615IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u32,(uint32_t *puDst))
3616{
3617 *puDst = ASMByteSwapU32(*puDst);
3618}
3619
3620
3621/* Note! undocument, so 32-bit arg */
3622IEM_DECL_IMPL_DEF(void, iemAImpl_bswap_u16,(uint32_t *puDst))
3623{
3624#if 0
3625 *(uint16_t *)puDst = ASMByteSwapU16(*(uint16_t *)puDst);
3626#else
3627 /* This is the behaviour AMD 3990x (64-bit mode): */
3628 *(uint16_t *)puDst = 0;
3629#endif
3630}
3631
3632# endif /* !defined(RT_ARCH_X86) || defined(IEM_WITHOUT_ASSEMBLY) */
3633
3634
3635
3636# if defined(IEM_WITHOUT_ASSEMBLY)
3637
3638/*
3639 * LFENCE, SFENCE & MFENCE.
3640 */
3641
/**
 * LFENCE worker, only built when IEM_WITHOUT_ASSEMBLY is defined.
 *
 * Takes no input and produces no output; it just issues ASMReadFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_lfence,(void))
{
    ASMReadFence();
}
3646
3647
/**
 * SFENCE worker, only built when IEM_WITHOUT_ASSEMBLY is defined.
 *
 * Takes no input and produces no output; it just issues ASMWriteFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_sfence,(void))
{
    ASMWriteFence();
}
3652
3653
/**
 * MFENCE worker, only built when IEM_WITHOUT_ASSEMBLY is defined.
 *
 * Takes no input and produces no output; it just issues ASMMemoryFence().
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_mfence,(void))
{
    ASMMemoryFence();
}
3658
3659
3660# ifndef RT_ARCH_ARM64
/**
 * Alternative memory fence worker; not built for ARM64 hosts and only when
 * IEM_WITHOUT_ASSEMBLY is defined.
 *
 * In this portable variant it is identical to iemAImpl_mfence, i.e. it just
 * issues ASMMemoryFence().
 * NOTE(review): presumably the assembly variant uses a different instruction
 * sequence for hosts without MFENCE - confirm against the assembly file.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_alt_mem_fence,(void))
{
    ASMMemoryFence();
}
3665# endif
3666
3667# endif
3668
3669#endif /* !RT_ARCH_AMD64 || IEM_WITHOUT_ASSEMBLY */
3670
3671
3672IEM_DECL_IMPL_DEF(void, iemAImpl_arpl,(uint16_t *pu16Dst, uint16_t u16Src, uint32_t *pfEFlags))
3673{
3674 if ((*pu16Dst & X86_SEL_RPL) < (u16Src & X86_SEL_RPL))
3675 {
3676 *pu16Dst &= X86_SEL_MASK_OFF_RPL;
3677 *pu16Dst |= u16Src & X86_SEL_RPL;
3678
3679 *pfEFlags |= X86_EFL_ZF;
3680 }
3681 else
3682 *pfEFlags &= ~X86_EFL_ZF;
3683}
3684
3685
3686#if defined(IEM_WITHOUT_ASSEMBLY)
3687
3688/*********************************************************************************************************************************
3689* x87 FPU Loads *
3690*********************************************************************************************************************************/
3691
/**
 * Converts a 32-bit floating point value to the 80-bit format for loading.
 *
 * The conversion is exact (the fraction is only shifted into place), so the
 * work consists of classifying the input and re-encoding it:
 *      - normal, zero and infinity are translated directly,
 *      - subnormal inputs are normalized and raise \#DE,
 *      - NaNs always come out quiet; a signalling NaN raises \#IE.
 *
 * @param   pFpuState   The FPU state; FCW (exception masks) and FSW (C0, C2
 *                      and C3) are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr32Val     The 32-bit value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r32,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT32U pr32Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT32U_IS_NORMAL(pr32Val))
    {
        /* Shift the fraction up to the top of the wider field and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_ZERO(pr32Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift is the number of leading zero bits in the source fraction.
           The extra '+ 1' moves the topmost set fraction bit out of the
           destination fraction field, as it becomes the explicit integer bit. */
        unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT32U_IS_INF(pr32Val))
    {
        /* Infinity: maximum exponent with only the integer bit (bit 63) set. */
        pFpuRes->r80Result.s.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT32U_IS_NAN(pr32Val));
        pFpuRes->r80Result.sj64.fSign     = pr32Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = (uint64_t)pr32Val->s.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
        if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                /* (So the TOP field is cleared again and the result zeroed.) */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3759
3760
/**
 * Converts a 64-bit floating point value to the 80-bit format for loading.
 *
 * The conversion is exact (the fraction is only shifted into place), so the
 * work consists of classifying the input and re-encoding it:
 *      - normal, zero and infinity are translated directly,
 *      - subnormal inputs are normalized and raise \#DE,
 *      - NaNs always come out quiet; a signalling NaN raises \#IE.
 *
 * @param   pFpuState   The FPU state; FCW (exception masks) and FSW (C0, C2
 *                      and C3) are read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pr64Val     The 64-bit value to load.
 *
 * @note    Same structure as iemAImpl_fld_r80_from_r32.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r64,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT64U pr64Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    if (RTFLOAT64U_IS_NORMAL(pr64Val))
    {
        /* Shift the fraction up to the top of the wider field and rebias the exponent. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
        Assert(RTFLOAT80U_IS_NORMAL(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_ZERO(pr64Val))
    {
        /* Signed zero stays a signed zero. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
        Assert(RTFLOAT80U_IS_ZERO(&pFpuRes->r80Result));
    }
    else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
    {
        /* Subnormal values gets normalized. */
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        /* cExtraShift is the number of leading zero bits in the source fraction.
           The extra '+ 1' moves the topmost set fraction bit out of the
           destination fraction field, as it becomes the explicit integer bit. */
        unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction
                                          << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
        pFpuRes->r80Result.sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
        pFpuRes->FSW |= X86_FSW_DE;
        if (!(pFpuState->FCW & X86_FCW_DM))
            pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B; /* The value is still pushed. */
    }
    else if (RTFLOAT64U_IS_INF(pr64Val))
    {
        /* Infinity: maximum exponent with only the integer bit (bit 63) set. */
        pFpuRes->r80Result.s.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
        Assert(RTFLOAT80U_IS_INF(&pFpuRes->r80Result));
    }
    else
    {
        /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
        Assert(RTFLOAT64U_IS_NAN(pr64Val));
        pFpuRes->r80Result.sj64.fSign     = pr64Val->s.fSign;
        pFpuRes->r80Result.sj64.uExponent = RTFLOAT80U_EXP_MAX;
        pFpuRes->r80Result.sj64.fInteger  = 1;
        pFpuRes->r80Result.sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
        if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
        {
            pFpuRes->r80Result.sj64.uFraction |= RT_BIT_64(62); /* make quiet */
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
            pFpuRes->FSW |= X86_FSW_IE;

            if (!(pFpuState->FCW & X86_FCW_IM))
            {
                /* The value is not pushed. */
                /* (So the TOP field is cleared again and the result zeroed.) */
                pFpuRes->FSW &= ~X86_FSW_TOP_MASK;
                pFpuRes->FSW |= X86_FSW_ES | X86_FSW_B;
                pFpuRes->r80Result.au64[0] = 0;
                pFpuRes->r80Result.au16[4] = 0;
            }
        }
        else
            Assert(RTFLOAT80U_IS_QUIET_NAN(&pFpuRes->r80Result));
    }
}
3826
3827
3828IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
3829{
3830 pFpuRes->r80Result.au64[0] = pr80Val->au64[0];
3831 pFpuRes->r80Result.au16[4] = pr80Val->au16[4];
3832 /* Raises no exceptions. */
3833 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3834}
3835
3836
3837IEM_DECL_IMPL_DEF(void, iemAImpl_fld1,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3838{
3839 pFpuRes->r80Result.sj64.fSign = 0;
3840 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3841 pFpuRes->r80Result.sj64.fInteger = 1;
3842 pFpuRes->r80Result.sj64.uFraction = 0;
3843
3844 /*
3845 * FPU status word:
3846 * - TOP is irrelevant, but we must match x86 assembly version.
3847 * - C1 is always cleared as we don't have any stack overflows.
3848 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
3849 */
3850 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
3851}
3852
3853
3854IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2e,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3855{
3856 pFpuRes->r80Result.sj64.fSign = 0;
3857 pFpuRes->r80Result.sj64.uExponent = 0 + 16383;
3858 pFpuRes->r80Result.sj64.fInteger = 1;
3859 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3860 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3861 ? UINT64_C(0x38aa3b295c17f0bc) : UINT64_C(0x38aa3b295c17f0bb);
3862 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3863}
3864
3865
3866IEM_DECL_IMPL_DEF(void, iemAImpl_fldl2t,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3867{
3868 pFpuRes->r80Result.sj64.fSign = 0;
3869 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3870 pFpuRes->r80Result.sj64.fInteger = 1;
3871 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) != X86_FCW_RC_UP
3872 ? UINT64_C(0x549a784bcd1b8afe) : UINT64_C(0x549a784bcd1b8aff);
3873 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3874}
3875
3876
3877IEM_DECL_IMPL_DEF(void, iemAImpl_fldlg2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3878{
3879 pFpuRes->r80Result.sj64.fSign = 0;
3880 pFpuRes->r80Result.sj64.uExponent = -2 + 16383;
3881 pFpuRes->r80Result.sj64.fInteger = 1;
3882 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3883 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3884 ? UINT64_C(0x1a209a84fbcff799) : UINT64_C(0x1a209a84fbcff798);
3885 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3886}
3887
3888
3889IEM_DECL_IMPL_DEF(void, iemAImpl_fldln2,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3890{
3891 pFpuRes->r80Result.sj64.fSign = 0;
3892 pFpuRes->r80Result.sj64.uExponent = -1 + 16383;
3893 pFpuRes->r80Result.sj64.fInteger = 1;
3894 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3895 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3896 ? UINT64_C(0x317217f7d1cf79ac) : UINT64_C(0x317217f7d1cf79ab);
3897 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3898}
3899
3900
3901IEM_DECL_IMPL_DEF(void, iemAImpl_fldpi,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3902{
3903 pFpuRes->r80Result.sj64.fSign = 0;
3904 pFpuRes->r80Result.sj64.uExponent = 1 + 16383;
3905 pFpuRes->r80Result.sj64.fInteger = 1;
3906 pFpuRes->r80Result.sj64.uFraction = (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
3907 || (pFpuState->FCW & X86_FCW_RC_MASK) == X86_FCW_RC_UP
3908 ? UINT64_C(0x490fdaa22168c235) : UINT64_C(0x490fdaa22168c234);
3909 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3910}
3911
3912
3913IEM_DECL_IMPL_DEF(void, iemAImpl_fldz,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes))
3914{
3915 pFpuRes->r80Result.s.fSign = 0;
3916 pFpuRes->r80Result.s.uExponent = 0;
3917 pFpuRes->r80Result.s.uMantissa = 0;
3918 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
3919}
3920
3921#define EMIT_FILD(a_cBits) \
3922IEM_DECL_IMPL_DEF(void, iemAImpl_fild_r80_from_i ## a_cBits,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, \
3923 int ## a_cBits ## _t const *piVal)) \
3924{ \
3925 int ## a_cBits ## _t iVal = *piVal; \
3926 if (iVal == 0) \
3927 { \
3928 pFpuRes->r80Result.s.fSign = 0; \
3929 pFpuRes->r80Result.s.uExponent = 0; \
3930 pFpuRes->r80Result.s.uMantissa = 0; \
3931 } \
3932 else \
3933 { \
3934 if (iVal > 0) \
3935 pFpuRes->r80Result.s.fSign = 0; \
3936 else \
3937 { \
3938 pFpuRes->r80Result.s.fSign = 1; \
3939 iVal = -iVal; \
3940 } \
3941 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
3942 pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
3943 pFpuRes->r80Result.s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
3944 } \
3945 pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */ \
3946}
3947EMIT_FILD(16)
3948EMIT_FILD(32)
3949EMIT_FILD(64)
3950
3951
/**
 * Converts a packed BCD value to the 80-bit floating point format.
 *
 * Each byte of abPairs holds two decimal digits, with abPairs[0] being the
 * least significant pair.  The digits are accumulated into a 64-bit integer,
 * which is then normalized like an integer load would do.  The digits are not
 * validated here.
 *
 * @param   pFpuState   The FPU state; only FSW (C0, C2 and C3) is read.
 * @param   pFpuRes     Where to return the 80-bit value and the FSW.
 * @param   pd80Val     The packed BCD value to load.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fld_r80_from_d80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTPBCD80U pd80Val))
{
    pFpuRes->FSW = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); /* see iemAImpl_fld1 */
    /* All digit pairs zero?  Then the result is a zero carrying the BCD sign. */
    if (   pd80Val->s.abPairs[0] == 0
        && pd80Val->s.abPairs[1] == 0
        && pd80Val->s.abPairs[2] == 0
        && pd80Val->s.abPairs[3] == 0
        && pd80Val->s.abPairs[4] == 0
        && pd80Val->s.abPairs[5] == 0
        && pd80Val->s.abPairs[6] == 0
        && pd80Val->s.abPairs[7] == 0
        && pd80Val->s.abPairs[8] == 0)
    {
        pFpuRes->r80Result.s.fSign     = pd80Val->s.fSign;
        pFpuRes->r80Result.s.uExponent = 0;
        pFpuRes->r80Result.s.uMantissa = 0;
    }
    else
    {
        pFpuRes->r80Result.s.fSign = pd80Val->s.fSign;

        /* Skip the leading (most significant) zero pairs. */
        size_t cPairs = RT_ELEMENTS(pd80Val->s.abPairs);
        while (cPairs > 0 && pd80Val->s.abPairs[cPairs - 1] == 0)
            cPairs--;

        /* Accumulate the decimal value; uFactor is the weight of the low digit
           of the current pair (1, 100, 10000, ...). */
        uint64_t uVal    = 0;
        uint64_t uFactor = 1;
        for (size_t iPair = 0; iPair < cPairs; iPair++, uFactor *= 100)
            uVal += RTPBCD80U_LO_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor
                  + RTPBCD80U_HI_DIGIT(pd80Val->s.abPairs[iPair]) * uFactor * 10;

        /* Normalize: the shift moves the top set bit of uVal up to bit 63. */
        unsigned const cBits = ASMBitLastSetU64(uVal);
        pFpuRes->r80Result.s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS;
        pFpuRes->r80Result.s.uMantissa = uVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits);
    }
}
3988
3989
3990/*********************************************************************************************************************************
3991* x87 FPU Stores *
3992*********************************************************************************************************************************/
3993
/**
 * Helper for storing a deconstructed and normal R80 value as a 32-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr32Dst         Where to return the output value, if one should be
 *                          returned.  Not written when returning early because
 *                          of an unmasked underflow or overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r32 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR64.
 */
static uint16_t iemAImpl_StoreNormalR80AsR32(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT32U pr32Dst)
{
    /* The mantissa bits that do not fit into the 32-bit format's fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS) - 1;
    /* What to add before truncating: half of the lowest kept bit for nearest,
       everything below the lowest kept bit when rounding away from zero (down
       for negative, up for positive values), nothing when truncating. */
    uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS - 1)
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint64_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT32U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT32U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent zero or negative.  This is attempted mapped
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        /* Tiny unless the exponent is exactly zero and adding the rounding
           value makes the mantissa wrap around. */
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 192 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Denormalize: shift the mantissa right, collapsing the bits that
               are shifted out into a sticky bit 0 so they still count as
               rounded off. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT32U_EXP_MAX
             || (   iExponentOut == RTFLOAT32U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr32Dst->s.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = 0;
        }
        else
        {   /* Max */
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX - 1;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT32U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS;

    pr32Dst->s.uFraction = (uint32_t)uMantissaOut; /* Note! too big for bitfield if normal. */
    pr32Dst->s.uExponent = iExponentOut;
    pr32Dst->s.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the value was rounded up (away from zero). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4126
4127
/**
 * Stores an 80-bit floating point value as a 32-bit one.
 *
 * Normal values are handed to iemAImpl_StoreNormalR80AsR32 for rounding and
 * range checking; all other classes of input are dealt with here.  The
 * destination is left untouched in the cases where an unmasked exception
 * prevents the store.
 *
 * @param   pFpuState   The FPU state; FCW and FSW (C0, C2 and C3) are read.
 * @param   pu16FSW     Where to return the resulting FPU status word.
 * @param   pr32Dst     Where to store the 32-bit value.
 * @param   pr80Src     The 80-bit value to store.
 *
 * @note    Exact same logic as iemAImpl_fst_r80_to_r64.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r32,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTFLOAT32U pr32Dst, PCRTFLOAT80U pr80Src))
{
    uint16_t const fFcw = pFpuState->FCW;
    /* Start out with TOP=7, the incoming C0, C2 and C3, and no exceptions. */
    uint16_t       fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
        fFsw = iemAImpl_StoreNormalR80AsR32(pr80Src->s.fSign, pr80Src->s.uMantissa,
                                            (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr32Dst);
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
    {
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = 0;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_ZERO(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INF(pr80Src))
    {
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = 0;
        Assert(RTFLOAT32U_IS_INF(pr32Dst));
    }
    else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
    {
        /* Mapped to +/-QNaN */
        pr32Dst->s.fSign     = pr80Src->s.fSign;
        pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
        pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
    }
    else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
    {
        /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
        if (fFcw & X86_FCW_IM)
        {
            /* Masked: store a negative QNaN with only the top fraction bit set. */
            pr32Dst->s.fSign     = 1;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
    }
    else if (RTFLOAT80U_IS_NAN(pr80Src))
    {
        /* IM applies to signalled NaN input only.  Everything is converted to quiet NaN. */
        if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
        {
            /* Keep the top fraction bits as payload and force the quiet bit. */
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = RTFLOAT32U_EXP_MAX;
            pr32Dst->s.uFraction = (uint32_t)(pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS));
            pr32Dst->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
            if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
                fFsw |= X86_FSW_IE;
        }
        else
            fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
    }
    else
    {
        /* Denormal values causes both an underflow and precision exception. */
        Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
        if (fFcw & X86_FCW_UM)
        {
            /* Masked underflow: the result is either a zero or - when rounding
               away from zero - the smallest subnormal (fraction = 1). */
            pr32Dst->s.fSign     = pr80Src->s.fSign;
            pr32Dst->s.uExponent = 0;
            if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
            {
                pr32Dst->s.uFraction = 1;
                fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
            else
            {
                pr32Dst->s.uFraction = 0;
                fFsw |= X86_FSW_UE | X86_FSW_PE;
                if (!(fFcw & X86_FCW_PM))
                    fFsw |= X86_FSW_ES | X86_FSW_B;
            }
        }
        else
            fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
    }
    *pu16FSW = fFsw;
}
4216
4217
/**
 * Helper for storing a deconstructed and normal R80 value as a 64-bit one.
 *
 * This uses the rounding rules indicated by fFcw and returns updated fFsw.
 *
 * @returns Updated FPU status word value.
 * @param   fSignIn         Incoming sign indicator.
 * @param   uMantissaIn     Incoming mantissa (dot between bit 63 and 62).
 * @param   iExponentIn     Unbiased exponent.
 * @param   fFcw            The FPU control word.
 * @param   fFsw            Prepped FPU status word, i.e. exceptions and C1 clear.
 * @param   pr64Dst         Where to return the output value, if one should be
 *                          returned.  Not written when returning early because
 *                          of an unmasked underflow or overflow.
 *
 * @note    Tailored as a helper for iemAImpl_fst_r80_to_r64 right now.
 * @note    Exact same logic as iemAImpl_StoreNormalR80AsR32.
 */
static uint16_t iemAImpl_StoreNormalR80AsR64(bool fSignIn, uint64_t uMantissaIn, int32_t iExponentIn,
                                             uint16_t fFcw, uint16_t fFsw, PRTFLOAT64U pr64Dst)
{
    /* The mantissa bits that do not fit into the 64-bit format's fraction. */
    uint64_t const fRoundingOffMask = RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS) - 1; /* 0x7ff */
    /* What to add before truncating: half of the lowest kept bit for nearest,
       everything below the lowest kept bit when rounding away from zero (down
       for negative, up for positive values), nothing when truncating. */
    uint32_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                    ? RT_BIT_64(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS - 1) /* 0x400 */
                                    : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                    ? fRoundingOffMask
                                    : 0;
    uint32_t       fRoundedOff      = uMantissaIn & fRoundingOffMask;

    /*
     * Deal with potential overflows/underflows first, optimizing for none.
     * 0 and MAX are used for special values; MAX-1 may be rounded up to MAX.
     */
    int32_t iExponentOut = (int32_t)iExponentIn + RTFLOAT64U_EXP_BIAS;
    if ((uint32_t)iExponentOut - 1 < (uint32_t)(RTFLOAT64U_EXP_MAX - 3))
    { /* likely? */ }
    /*
     * Underflow if the exponent zero or negative.  This is attempted mapped
     * to a subnormal number when possible, with some additional trickery ofc.
     */
    else if (iExponentOut <= 0)
    {
        /* Tiny unless the exponent is exactly zero and adding the rounding
           value makes the mantissa wrap around. */
        bool const fIsTiny = iExponentOut < 0
                          || UINT64_MAX - uMantissaIn > uRoundingAdd;
        if (!(fFcw & X86_FCW_UM) && fIsTiny)
            /* Note! 754-1985 sec 7.4 has something about bias adjust of 1536 here, not in 2008 & 2019. Perhaps only 8087 & 287? */
            return fFsw | X86_FSW_UE | X86_FSW_ES | X86_FSW_B;

        if (iExponentOut <= 0)
        {
            /* Denormalize: shift the mantissa right, collapsing the bits that
               are shifted out into a sticky bit 0 so they still count as
               rounded off. */
            uMantissaIn = iExponentOut <= -63
                        ? uMantissaIn != 0
                        : (uMantissaIn >> (-iExponentOut + 1)) | ((uMantissaIn & (RT_BIT_64(-iExponentOut + 1) - 1)) != 0);
            fRoundedOff = uMantissaIn & fRoundingOffMask;
            if (fRoundedOff && fIsTiny)
                fFsw |= X86_FSW_UE;
            iExponentOut = 0;
        }
    }
    /*
     * Overflow if at or above max exponent value or if we will reach max
     * when rounding.  Will return +/-infinity or +/-max value depending on
     * whether we're rounding or not.
     */
    else if (   iExponentOut >= RTFLOAT64U_EXP_MAX
             || (   iExponentOut == RTFLOAT64U_EXP_MAX - 1
                 && UINT64_MAX - uMantissaIn <= uRoundingAdd))
    {
        fFsw |= X86_FSW_OE;
        if (!(fFcw & X86_FCW_OM))
            return fFsw | X86_FSW_ES | X86_FSW_B;
        fFsw |= X86_FSW_PE;
        if (uRoundingAdd)
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;

        pr64Dst->s64.fSign = fSignIn;
        if (uRoundingAdd)
        {   /* Infinity (max exponent, zero fraction) */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
            pr64Dst->s64.uFraction = 0;
        }
        else
        {   /* Max */
            pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX - 1;
            pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS) - 1;
        }
        return fFsw;
    }

    /*
     * Normal or subnormal number.
     */
    /* Do rounding - just truncate in near mode when midway on an even outcome. */
    uint64_t uMantissaOut = uMantissaIn;
    if (   (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
        || (uMantissaIn & RT_BIT_32(RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS))
        || fRoundedOff != uRoundingAdd)
    {
        uMantissaOut = uMantissaIn + uRoundingAdd;
        if (uMantissaOut >= uMantissaIn)
        { /* likely */ }
        else
        {
            /* The addition wrapped around, i.e. carried out of bit 63. */
            uMantissaOut >>= 1; /* (We don't need to add bit 63 here (the integer bit), as it will be chopped off below.) */
            iExponentOut++;
            Assert(iExponentOut < RTFLOAT64U_EXP_MAX); /* checked above */
            fFsw |= X86_FSW_C1;
        }
    }
    else
        uMantissaOut = uMantissaIn;

    /* Truncate the mantissa and set the return value. */
    uMantissaOut >>= RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS;

    pr64Dst->s64.uFraction = uMantissaOut; /* Note! too big for bitfield if normal. */
    pr64Dst->s64.uExponent = iExponentOut;
    pr64Dst->s64.fSign     = fSignIn;

    /* Set status flags related to rounding. */
    if (fRoundedOff)
    {
        fFsw |= X86_FSW_PE;
        /* C1 indicates that the value was rounded up (away from zero). */
        if (uMantissaOut > (uMantissaIn >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS)))
            fFsw |= X86_FSW_C1;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }

    return fFsw;
}
4350
4351
4352/**
4353 * @note Exact same logic as iemAImpl_fst_r80_to_r32.
4354 */
4355IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r64,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4356 PRTFLOAT64U pr64Dst, PCRTFLOAT80U pr80Src))
4357{
4358 uint16_t const fFcw = pFpuState->FCW;
4359 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT) | (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
4360 if (RTFLOAT80U_IS_NORMAL(pr80Src))
4361 fFsw = iemAImpl_StoreNormalR80AsR64(pr80Src->s.fSign, pr80Src->s.uMantissa,
4362 (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS, fFcw, fFsw, pr64Dst);
4363 else if (RTFLOAT80U_IS_ZERO(pr80Src))
4364 {
4365 pr64Dst->s64.fSign = pr80Src->s.fSign;
4366 pr64Dst->s64.uExponent = 0;
4367 pr64Dst->s64.uFraction = 0;
4368 Assert(RTFLOAT64U_IS_ZERO(pr64Dst));
4369 }
4370 else if (RTFLOAT80U_IS_INF(pr80Src))
4371 {
4372 pr64Dst->s64.fSign = pr80Src->s.fSign;
4373 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4374 pr64Dst->s64.uFraction = 0;
4375 Assert(RTFLOAT64U_IS_INF(pr64Dst));
4376 }
4377 else if (RTFLOAT80U_IS_INDEFINITE(pr80Src))
4378 {
4379 /* Mapped to +/-QNaN */
4380 pr64Dst->s64.fSign = pr80Src->s.fSign;
4381 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4382 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4383 }
4384 else if (RTFLOAT80U_IS_PSEUDO_INF(pr80Src) || RTFLOAT80U_IS_UNNORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Src))
4385 {
4386 /* Pseudo-Inf / Pseudo-Nan / Unnormal -> QNaN (during load, probably) */
4387 if (fFcw & X86_FCW_IM)
4388 {
4389 pr64Dst->s64.fSign = 1;
4390 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4391 pr64Dst->s64.uFraction = RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4392 fFsw |= X86_FSW_IE;
4393 }
4394 else
4395 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;;
4396 }
4397 else if (RTFLOAT80U_IS_NAN(pr80Src))
4398 {
4399 /* IM applies to signalled NaN input only. Everything is converted to quiet NaN. */
4400 if ((fFcw & X86_FCW_IM) || !RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4401 {
4402 pr64Dst->s64.fSign = pr80Src->s.fSign;
4403 pr64Dst->s64.uExponent = RTFLOAT64U_EXP_MAX;
4404 pr64Dst->s64.uFraction = pr80Src->sj64.uFraction >> (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
4405 pr64Dst->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
4406 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Src))
4407 fFsw |= X86_FSW_IE;
4408 }
4409 else
4410 fFsw |= X86_FSW_IE | X86_FSW_ES | X86_FSW_B;
4411 }
4412 else
4413 {
4414 /* Denormal values causes both an underflow and precision exception. */
4415 Assert(RTFLOAT80U_IS_DENORMAL(pr80Src) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src));
4416 if (fFcw & X86_FCW_UM)
4417 {
4418 pr64Dst->s64.fSign = pr80Src->s.fSign;
4419 pr64Dst->s64.uExponent = 0;
4420 if ((fFcw & X86_FCW_RC_MASK) == (!pr80Src->s.fSign ? X86_FCW_RC_UP : X86_FCW_RC_DOWN))
4421 {
4422 pr64Dst->s64.uFraction = 1;
4423 fFsw |= X86_FSW_UE | X86_FSW_PE | X86_FSW_C1;
4424 if (!(fFcw & X86_FCW_PM))
4425 fFsw |= X86_FSW_ES | X86_FSW_B;
4426 }
4427 else
4428 {
4429 pr64Dst->s64.uFraction = 0;
4430 fFsw |= X86_FSW_UE | X86_FSW_PE;
4431 if (!(fFcw & X86_FCW_PM))
4432 fFsw |= X86_FSW_ES | X86_FSW_B;
4433 }
4434 }
4435 else
4436 fFsw |= X86_FSW_UE | X86_FSW_ES | X86_FSW_B;
4437 }
4438 *pu16FSW = fFsw;
4439}
4440
4441
4442IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
4443 PRTFLOAT80U pr80Dst, PCRTFLOAT80U pr80Src))
4444{
4445 /*
4446 * FPU status word:
4447 * - TOP is irrelevant, but we must match x86 assembly version (0).
4448 * - C1 is always cleared as we don't have any stack overflows.
4449 * - C0, C2, and C3 are undefined and Intel 10980XE does not touch them.
4450 */
4451 *pu16FSW = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3); /* see iemAImpl_fld1 */
4452 *pr80Dst = *pr80Src;
4453}
4454
4455
/*
 * Generator for the FIST family: converts an 80-bit floating point value to a
 * signed integer of a_cBits bits, rounding according to FCW.RC.
 *
 * Mantissa:
 * 63         56        48        40        32        24        16        8         0
 * v          v         v         v         v         v         v         v         v
 * 1[.]111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000 1111 0000
 *     \  \    \    \    \    \    \    \    \    \    \    \    \    \    \    \
 * Exp: 0  4    8    12   16   20   24   28   32   36   40   44   48   52   56   60
 *
 * int64_t has the same width, only bit 63 is the sign bit.  So, the max we can map over
 * are bits 1 thru 63, dropping off bit 0, with an exponent of 62.  The number of bits we
 * drop off from the mantissa increases with decreasing exponent, till an exponent of 0
 * where we'll drop off all but bit 63.
 *
 * Macro parameters:
 *      a_cBits             Width of the destination integer in bits; also part of
 *                          the generated function name (iemAImpl_fist_r80_to_i<a_cBits>).
 *      a_iType             The destination integer type.
 *      a_iTypeMin          The minimum value of a_iType, stored for the MIN special
 *                          cases and for a masked positive overflow after rounding.
 *      a_iTypeIndefinite   The value stored when an invalid operation (IE) is masked.
 *
 * The generated function leaves *piDst untouched when IE is raised unmasked; in
 * that case TOP (7) is additionally or'ed into the returned FSW.
 */
#define EMIT_FIST(a_cBits, a_iType, a_iTypeMin, a_iTypeIndefinite) \
IEM_DECL_IMPL_DEF(void, iemAImpl_fist_r80_to_i ## a_cBits,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                         a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* The unsigned compare also rejects negative exponents. */ \
        if ((uint32_t)iExponent <= a_cBits - 2) \
        { \
            /* Number of mantissa bits below the binary point (at least one here). */ \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            /* Nearest: add half an integer step; away-from-zero modes: add just under one step; otherwise truncate. */ \
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST \
                                            ? RT_BIT_64(cShiftOff - 1) \
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP) \
                                            ? fRoundingOffMask \
                                            : 0; \
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask; \
            \
            uMantissa >>= cShiftOff; \
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff; \
            uMantissa += uRounding; \
            /* Still fits below the sign bit of the target type? */ \
            if (!(uMantissa & RT_BIT_64(a_cBits - 1))) \
            { \
                if (fRoundedOff) \
                { \
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd) \
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */ \
                    else if (uRounding) \
                        fFsw |= X86_FSW_C1; \
                    fFsw |= X86_FSW_PE; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                \
                if (!fSignIn) \
                    *piDst = (a_iType)uMantissa; \
                else \
                    *piDst = -(a_iType)uMantissa; \
            } \
            else \
            { \
                /* overflowed after rounding. */ \
                AssertMsg(iExponent == a_cBits - 2 && uMantissa == RT_BIT_64(a_cBits - 1), \
                          ("e=%d m=%#RX64 (org %#RX64) s=%d; shift=%d ro=%#RX64 rm=%#RX64 ra=%#RX64\n", iExponent, uMantissa, \
                           pr80Val->s.uMantissa, fSignIn, cShiftOff, fRoundedOff, fRoundingOffMask, uRoundingAdd)); \
                \
                /* Special case for the integer minimum value. */ \
                if (fSignIn) \
                { \
                    *piDst = a_iTypeMin; \
                    fFsw |= X86_FSW_PE | X86_FSW_C1; \
                    if (!(fFcw & X86_FCW_PM)) \
                        fFsw |= X86_FSW_ES | X86_FSW_B; \
                } \
                else \
                { \
                    /* Positive overflow: invalid operation; a_iTypeMin is what gets stored when masked. */ \
                    fFsw |= X86_FSW_IE; \
                    if (fFcw & X86_FCW_IM) \
                        *piDst = a_iTypeMin; \
                    else \
                        fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
                } \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one (negative exponent). \
         */ \
        else if (iExponent < 0) \
        { \
            if (!fSignIn) \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                { \
                    *piDst = 1; \
                    fFsw |= X86_FSW_C1; \
                } \
                else \
                    *piDst = 0; \
            } \
            else \
            { \
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP \
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO \
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST)) \
                    *piDst = 0; \
                else \
                { \
                    *piDst = -1; \
                    fFsw |= X86_FSW_C1; \
                } \
            } \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (  a_cBits < 64 && (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_DOWN \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63))) \
        { \
            *piDst = a_iTypeMin; \
            /* Anything below the integer part makes the result inexact. */ \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are either rounded \
     * to zero, 1 or -1 depending on sign and rounding control. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)) \
            *piDst = 0; \
        else \
        { \
            *piDst = fSignIn ? -1 : 1; \
            fFsw |= X86_FSW_C1; \
        } \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
EMIT_FIST(64, int64_t, INT64_MIN, X86_FPU_INT64_INDEFINITE)
EMIT_FIST(32, int32_t, INT32_MIN, X86_FPU_INT32_INDEFINITE)
EMIT_FIST(16, int16_t, INT16_MIN, X86_FPU_INT16_INDEFINITE)
4642
4643#endif /*IEM_WITHOUT_ASSEMBLY */
4644
4645
/*
 * Generator for the truncating store-integer family (iemAImpl_fistt_r80_to_i*).
 *
 * The FISTT instruction was added with SSE3 and is a lot simpler than FIST, as
 * the fraction is always chopped off regardless of FCW.RC.
 *
 * The 16-bit version is a bit peculiar, though, as it seems to be raising IE
 * as if it was the 32-bit version (i.e. starting with exp 31 instead of 15),
 * thus the @a a_cBitsIn.
 *
 * Macro parameters:
 *      a_cBits             Width of the destination integer in bits; part of the
 *                          generated function name.
 *      a_cBitsIn           Width used for the in-range exponent test.
 *      a_iType             The destination integer type.
 *      a_iTypeMin          The minimum value of a_iType (MIN special case).
 *      a_iTypeMax          Not referenced by the macro body.
 *      a_iTypeIndefinite   The value stored when an invalid operation (IE) is masked.
 *      a_Suffix            Function name suffix (RT_NOTHING, _intel or _amd).
 *      a_fIntelVersion     Not referenced by the macro body, so the _intel and
 *                          _amd instantiations below expand to identical code.
 */
#define EMIT_FISTT(a_cBits, a_cBitsIn, a_iType, a_iTypeMin, a_iTypeMax, a_iTypeIndefinite, a_Suffix, a_fIntelVersion) \
IEM_DECL_IMPL_DEF(void, RT_CONCAT3(iemAImpl_fistt_r80_to_i,a_cBits,a_Suffix),(PCX86FXSTATE pFpuState, uint16_t *pu16FSW, \
                                                                            a_iType *piDst, PCRTFLOAT80U pr80Val)) \
{ \
    uint16_t const fFcw    = pFpuState->FCW; \
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)); \
    bool const     fSignIn = pr80Val->s.fSign; \
    \
    /* \
     * Deal with normal numbers first. \
     */ \
    if (RTFLOAT80U_IS_NORMAL(pr80Val)) \
    { \
        uint64_t uMantissa = pr80Val->s.uMantissa; \
        int32_t  iExponent = (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS; \
        \
        /* The unsigned compare also rejects negative exponents. */ \
        if ((uint32_t)iExponent <= a_cBitsIn - 2) \
        { \
            unsigned const cShiftOff        = 63 - iExponent; \
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1; \
            uint64_t const fRoundedOff      = uMantissa & fRoundingOffMask; \
            uMantissa >>= cShiftOff; \
            /*Assert(!(uMantissa & RT_BIT_64(a_cBits - 1)));*/ \
            if (!fSignIn) \
                *piDst = (a_iType)uMantissa; \
            else \
                *piDst = -(a_iType)uMantissa; \
            \
            /* Dropping any fraction bits makes the result inexact. */ \
            if (fRoundedOff) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Tiny numbers with a magnitude below one are truncated to zero. \
         */ \
        else if (iExponent < 0) \
        { \
            *piDst = 0; \
            fFsw |= X86_FSW_PE; \
            if (!(fFcw & X86_FCW_PM)) \
                fFsw |= X86_FSW_ES | X86_FSW_B; \
        } \
        /* \
         * Special MIN case. \
         */ \
        else if (   fSignIn && iExponent == a_cBits - 1 \
                 && (a_cBits < 64 \
                     ? uMantissa < (RT_BIT_64(63) | RT_BIT_64(65 - a_cBits)) \
                     : uMantissa == RT_BIT_64(63)) ) \
        { \
            *piDst = a_iTypeMin; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Figure this weirdness. \
         * (This branch is disabled by the leading constant zero.) \
         */ \
        else if (0 /* huh? gone? */ && a_cBits == 16 && fSignIn && iExponent == 31 && uMantissa < UINT64_C(0x8000100000000000) ) \
        { \
            *piDst = 0; \
            if (uMantissa & (RT_BIT_64(64 - a_cBits + 1) - 1)) \
            { \
                fFsw |= X86_FSW_PE; \
                if (!(fFcw & X86_FCW_PM)) \
                    fFsw |= X86_FSW_ES | X86_FSW_B; \
            } \
        } \
        /* \
         * Too large/small number outside the target integer range. \
         */ \
        else \
        { \
            fFsw |= X86_FSW_IE; \
            if (fFcw & X86_FCW_IM) \
                *piDst = a_iTypeIndefinite; \
            else \
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
        } \
    } \
    /* \
     * Map both +0 and -0 to integer zero (signless/+). \
     */ \
    else if (RTFLOAT80U_IS_ZERO(pr80Val)) \
        *piDst = 0; \
    /* \
     * Denormals are just really tiny sub-zero numbers that are truncated to zero. \
     */ \
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) || RTFLOAT80U_IS_DENORMAL(pr80Val)) \
    { \
        *piDst = 0; \
        fFsw |= X86_FSW_PE; \
        if (!(fFcw & X86_FCW_PM)) \
            fFsw |= X86_FSW_ES | X86_FSW_B; \
    } \
    /* \
     * All other special values are considered invalid arguments and result \
     * in an IE exception and indefinite value if masked. \
     */ \
    else \
    { \
        fFsw |= X86_FSW_IE; \
        if (fFcw & X86_FCW_IM) \
            *piDst = a_iTypeIndefinite; \
        else \
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT); \
    } \
    *pu16FSW = fFsw; \
}
#if defined(IEM_WITHOUT_ASSEMBLY)
EMIT_FISTT(64, 64, int64_t, INT64_MIN, INT64_MAX, X86_FPU_INT64_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(32, 32, int32_t, INT32_MIN, INT32_MAX, X86_FPU_INT32_INDEFINITE, RT_NOTHING, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, RT_NOTHING, 1)
#endif
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _intel, 1)
EMIT_FISTT(16, 16, int16_t, INT16_MIN, INT16_MAX, X86_FPU_INT16_INDEFINITE, _amd, 0)
4775
4776
4777#if defined(IEM_WITHOUT_ASSEMBLY)
4778
/**
 * Stores an 80-bit floating point value as an 18 digit packed BCD integer,
 * rounding according to FCW.RC, and produces the resulting FPU status word.
 *
 * The destination is left untouched when an invalid operation (IE) is raised
 * unmasked; in that case TOP (7) is additionally or'ed into the returned FSW.
 *
 * @param   pFpuState   The FPU state; only FCW and FSW are read.
 * @param   pu16FSW     Where to return the updated status word.
 * @param   pd80Dst     Where to store the packed BCD result.
 * @param   pr80Src     The value to convert.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_fst_r80_to_d80,(PCX86FXSTATE pFpuState, uint16_t *pu16FSW,
                                                 PRTPBCD80U pd80Dst, PCRTFLOAT80U pr80Src))
{
    /* Constant results; the two-entry tables are indexed by the input sign bit. */
    /*static RTPBCD80U const s_ad80MaxMin[2] = { RTPBCD80U_INIT_MAX(), RTPBCD80U_INIT_MIN() };*/
    static RTPBCD80U const s_ad80Zeros[2]  = { RTPBCD80U_INIT_ZERO(0), RTPBCD80U_INIT_ZERO(1) };
    static RTPBCD80U const s_ad80One[2]    = { RTPBCD80U_INIT_C(0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1),
                                               RTPBCD80U_INIT_C(1, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,1) };
    static RTPBCD80U const s_d80Indefinite = RTPBCD80U_INIT_INDEFINITE();

    uint16_t const fFcw    = pFpuState->FCW;
    uint16_t       fFsw    = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3));
    bool const     fSignIn = pr80Src->s.fSign;

    /*
     * Deal with normal numbers first.
     */
    if (RTFLOAT80U_IS_NORMAL(pr80Src))
    {
        uint64_t uMantissa = pr80Src->s.uMantissa;
        int32_t  iExponent = (int32_t)pr80Src->s.uExponent - RTFLOAT80U_EXP_BIAS;
        /* With an exponent of 59 the mantissa is shifted right by 4 below, so the
           bound corresponds to 0x0de0b6b3a763ffff = 999,999,999,999,999,999, the
           largest value representable with 18 decimal digits. */
        if (   (uint32_t)iExponent <= 58
            || ((uint32_t)iExponent == 59 && uMantissa <= UINT64_C(0xde0b6b3a763fffff)) )
        {
            /* Number of mantissa bits below the binary point. */
            unsigned const cShiftOff        = 63 - iExponent;
            uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
            /* Nearest: add half an integer step; away-from-zero modes: add just under one step; otherwise truncate. */
            uint64_t const uRoundingAdd     = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
                                            ? RT_BIT_64(cShiftOff - 1)
                                            : (fFcw & X86_FCW_RC_MASK) == (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
                                            ? fRoundingOffMask
                                            : 0;
            uint64_t       fRoundedOff      = uMantissa & fRoundingOffMask;

            uMantissa >>= cShiftOff;
            uint64_t const uRounding = (fRoundedOff + uRoundingAdd) >> cShiftOff;
            uMantissa += uRounding;
            if (uMantissa <= (uint64_t)RTPBCD80U_MAX)
            {
                if (fRoundedOff)
                {
                    if ((uMantissa & 1) && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST && fRoundedOff == uRoundingAdd)
                        uMantissa &= ~(uint64_t)1; /* round to even number if equal distance between up/down. */
                    else if (uRounding)
                        fFsw |= X86_FSW_C1;
                    fFsw |= X86_FSW_PE;
                    if (!(fFcw & X86_FCW_PM))
                        fFsw |= X86_FSW_ES | X86_FSW_B;
                }

                /* Emit two decimal digits per byte, least significant pair first. */
                pd80Dst->s.fSign = fSignIn;
                pd80Dst->s.uPad  = 0;
                for (size_t iPair = 0; iPair < RT_ELEMENTS(pd80Dst->s.abPairs); iPair++)
                {
                    unsigned const uDigits = uMantissa % 100;
                    uMantissa /= 100;
                    uint8_t const bLo = uDigits % 10;
                    uint8_t const bHi = uDigits / 10;
                    pd80Dst->s.abPairs[iPair] = RTPBCD80U_MAKE_PAIR(bHi, bLo);
                }
            }
            else
            {
                /* overflowed after rounding. */
                fFsw |= X86_FSW_IE;
                if (fFcw & X86_FCW_IM)
                    *pd80Dst = s_d80Indefinite;
                else
                    fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
            }
        }
        /*
         * Tiny numbers with a magnitude below one (negative exponent).
         */
        else if (iExponent < 0)
        {
            if (!fSignIn)
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (iExponent == -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
                else
                    *pd80Dst = s_ad80Zeros[fSignIn];
            }
            else
            {
                if (   (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_UP
                    || (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_ZERO
                    || (iExponent < -1 && (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST))
                    *pd80Dst = s_ad80Zeros[fSignIn];
                else
                {
                    *pd80Dst = s_ad80One[fSignIn];
                    fFsw |= X86_FSW_C1;
                }
            }
            fFsw |= X86_FSW_PE;
            if (!(fFcw & X86_FCW_PM))
                fFsw |= X86_FSW_ES | X86_FSW_B;
        }
        /*
         * Too large/small number outside the target integer range.
         */
        else
        {
            fFsw |= X86_FSW_IE;
            if (fFcw & X86_FCW_IM)
                *pd80Dst = s_d80Indefinite;
            else
                fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
        }
    }
    /*
     * Zeros are stored as a packed BCD zero selected by the input sign bit
     * (unlike the integer stores, which have no signed zero).
     */
    else if (RTFLOAT80U_IS_ZERO(pr80Src))
        *pd80Dst = s_ad80Zeros[fSignIn];
    /*
     * Denormals are just really tiny sub-zero numbers that are either rounded
     * to zero, 1 or -1 depending on sign and rounding control.
     */
    else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Src) || RTFLOAT80U_IS_DENORMAL(pr80Src))
    {
        if ((fFcw & X86_FCW_RC_MASK) != (fSignIn ? X86_FCW_RC_DOWN : X86_FCW_RC_UP))
            *pd80Dst = s_ad80Zeros[fSignIn];
        else
        {
            *pd80Dst = s_ad80One[fSignIn];
            fFsw |= X86_FSW_C1;
        }
        fFsw |= X86_FSW_PE;
        if (!(fFcw & X86_FCW_PM))
            fFsw |= X86_FSW_ES | X86_FSW_B;
    }
    /*
     * All other special values are considered invalid arguments and result
     * in an IE exception and indefinite value if masked.
     */
    else
    {
        fFsw |= X86_FSW_IE;
        if (fFcw & X86_FCW_IM)
            *pd80Dst = s_d80Indefinite;
        else
            fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
    }
    *pu16FSW = fFsw;
}
4928
4929
/*********************************************************************************************************************************
*   FPU Helpers                                                                                                                  *
*********************************************************************************************************************************/
/* Compile-time checks that the IPRT floating point types have the sizes
   (in bytes) that the helpers in this section rely on. */
AssertCompileSize(RTFLOAT128U, 16);
AssertCompileSize(RTFLOAT80U, 10);
AssertCompileSize(RTFLOAT64U, 8);
AssertCompileSize(RTFLOAT32U, 4);
4937
/**
 * Normalizes a possible pseudo-denormal value.
 *
 * Pseudo-denormal values are some oddities from the 8087 & 287 days.  They are
 * denormals with the J-bit set, so they can simply be rewritten as 2**-16382,
 * i.e. changing uExponent from 0 to 1.
 *
 * This macro will declare a RTFLOAT80U with the name given by
 * @a a_r80ValNormalized and update the @a a_pr80Val variable to point to it if
 * a normalization was performed.
 *
 * The expansion ends in a dangling 'else do {} while (0)', so the invocation
 * must be followed by a semicolon and, because it declares a variable, can
 * only be used where a declaration is allowed.
 *
 * @note This must be applied before calling SoftFloat with a value that could be
 *       a pseudo-denormal, as SoftFloat doesn't handle pseudo-denormals
 *       correctly.
 */
#define IEM_NORMALIZE_PSEUDO_DENORMAL(a_pr80Val, a_r80ValNormalized) \
    RTFLOAT80U a_r80ValNormalized; \
    if (RTFLOAT80U_IS_PSEUDO_DENORMAL(a_pr80Val)) \
    { \
        a_r80ValNormalized = *a_pr80Val; \
        a_r80ValNormalized.s.uExponent = 1; \
        a_pr80Val = &a_r80ValNormalized; \
    } else do {} while (0)
4961
4962#ifdef IEM_WITH_FLOAT128_FOR_FPU
4963
4964DECLINLINE(int) iemFpuF128SetRounding(uint16_t fFcw)
4965{
4966 int fNew;
4967 switch (fFcw & X86_FCW_RC_MASK)
4968 {
4969 default:
4970 case X86_FCW_RC_NEAREST: fNew = FE_TONEAREST; break;
4971 case X86_FCW_RC_ZERO: fNew = FE_TOWARDZERO; break;
4972 case X86_FCW_RC_UP: fNew = FE_UPWARD; break;
4973 case X86_FCW_RC_DOWN: fNew = FE_DOWNWARD; break;
4974 }
4975 int fOld = fegetround();
4976 fesetround(fNew);
4977 return fOld;
4978}
4979
4980
4981DECLINLINE(void) iemFpuF128RestoreRounding(int fOld)
4982{
4983 fesetround(fOld);
4984}
4985
4986DECLINLINE(_Float128) iemFpuF128FromFloat80(PCRTFLOAT80U pr80Val, uint16_t fFcw)
4987{
4988 RT_NOREF(fFcw);
4989 RTFLOAT128U Tmp;
4990 Tmp.s2.uSignAndExponent = pr80Val->s2.uSignAndExponent;
4991 Tmp.s2.uFractionHigh = (uint16_t)((pr80Val->s2.uMantissa & (RT_BIT_64(63) - 1)) >> 48);
4992 Tmp.s2.uFractionMid = (uint32_t)((pr80Val->s2.uMantissa & UINT32_MAX) >> 16);
4993 Tmp.s2.uFractionLow = pr80Val->s2.uMantissa << 48;
4994 if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
4995 {
4996 Assert(Tmp.s.uExponent == 0);
4997 Tmp.s2.uSignAndExponent++;
4998 }
4999 return *(_Float128 *)&Tmp;
5000}
5001
5002
5003DECLINLINE(uint16_t) iemFpuF128ToFloat80(PRTFLOAT80U pr80Dst, _Float128 rd128ValSrc, uint16_t fFcw, uint16_t fFsw)
5004{
5005 RT_NOREF(fFcw);
5006 RTFLOAT128U Tmp;
5007 *(_Float128 *)&Tmp = rd128ValSrc;
5008 ASMCompilerBarrier();
5009 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5010 {
5011 pr80Dst->s.fSign = Tmp.s64.fSign;
5012 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5013 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5014 | Tmp.s64.uFractionLo >> (64 - 15);
5015
5016 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5017 unsigned const cShiftOff = 64 - 15;
5018 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5019 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5020 if (uRoundedOff)
5021 {
5022 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5023 ? RT_BIT_64(cShiftOff - 1)
5024 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5025 ? fRoundingOffMask
5026 : 0;
5027 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5028 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5029 || uRoundedOff != uRoundingAdd)
5030 {
5031 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5032 {
5033 uFraction += 1;
5034 if (!(uFraction & RT_BIT_64(63)))
5035 { /* likely */ }
5036 else
5037 {
5038 uFraction >>= 1;
5039 pr80Dst->s.uExponent++;
5040 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5041 return fFsw;
5042 }
5043 fFsw |= X86_FSW_C1;
5044 }
5045 }
5046 fFsw |= X86_FSW_PE;
5047 if (!(fFcw & X86_FCW_PM))
5048 fFsw |= X86_FSW_ES | X86_FSW_B;
5049 }
5050 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5051 }
5052 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5053 {
5054 pr80Dst->s.fSign = Tmp.s64.fSign;
5055 pr80Dst->s.uExponent = 0;
5056 pr80Dst->s.uMantissa = 0;
5057 }
5058 else if (RTFLOAT128U_IS_INF(&Tmp))
5059 {
5060 pr80Dst->s.fSign = Tmp.s64.fSign;
5061 pr80Dst->s.uExponent = 0;
5062 pr80Dst->s.uMantissa = 0;
5063 }
5064 return fFsw;
5065}
5066
5067
5068#else /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5069
/**
 * Initializer for the SoftFloat state structure.
 *
 * This is a positional aggregate initializer with five members, in order:
 *      -# softfloat_tininess_afterRounding,
 *      -# the SoftFloat rounding mode matching FCW.RC (anything but
 *         nearest/up/down maps to softfloat_round_minMag),
 *      -# zero,
 *      -# the exception mask bits of the FCW,
 *      -# 64 for FCW.PC = 53 bits, 32 for FCW.PC = 24 bits, otherwise 80.
 *
 * NOTE(review): the member meanings (tininess detection, rounding mode,
 * exception flags, exception mask, rounding precision) are assumed from the
 * value expressions - confirm against the softfloat_state_t declaration.
 *
 * @param   a_fFcw  The FPU control word (evaluated multiple times).
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(a_fFcw) \
    { \
        softfloat_tininess_afterRounding, \
        ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST ? (uint8_t)softfloat_round_near_even \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_UP    ? (uint8_t)softfloat_round_max \
        : ((a_fFcw) & X86_FCW_RC_MASK) == X86_FCW_RC_DOWN  ? (uint8_t)softfloat_round_min \
        :                                                    (uint8_t)softfloat_round_minMag, \
        0, \
        (uint8_t)((a_fFcw) & X86_FCW_XCPT_MASK), \
        ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_53 ? (uint8_t)64 \
        : ((a_fFcw) & X86_FCW_PC_MASK) == X86_FCW_PC_24 ? (uint8_t)32 : (uint8_t)80 \
    }
5083
/**
 * Returns updated FSW from a SoftFloat state and exception mask (FCW).
 *
 * The result is @a a_fFsw with the following or'ed in:
 *      - the softfloat_flag_c1 bit of the exception flags shifted left by two
 *        (presumably landing on X86_FSW_C1 - confirm against the flag value),
 *      - the exception flags covered by X86_FSW_XCPT_MASK,
 *      - X86_FSW_ES and X86_FSW_B if any of those exceptions is not masked in
 *        @a a_fFcw.
 *
 * @param   a_fFsw          The FSW to update.
 * @param   a_pSoftState    The SoftFloat state (evaluated multiple times).
 * @param   a_fFcw          The FPU control word.
 */
# define IEM_SOFTFLOAT_STATE_TO_FSW(a_fFsw, a_pSoftState, a_fFcw) \
    (  (a_fFsw) \
     | (uint16_t)(((a_pSoftState)->exceptionFlags & softfloat_flag_c1) << 2) \
     | ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) \
     | (  ((a_pSoftState)->exceptionFlags & X86_FSW_XCPT_MASK) & (~(a_fFcw) & X86_FSW_XCPT_MASK) \
        ? X86_FSW_ES | X86_FSW_B : 0) )
5091
5092
5093DECLINLINE(float128_t) iemFpuSoftF128Precision(float128_t r128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5094{
5095 RT_NOREF(fFcw);
5096 Assert(cBits > 64);
5097# if 0 /* rounding does not seem to help */
5098 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5099 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5100 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5101 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5102 {
5103 uint64_t uOld = r128.v[0];
5104 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5105 if (r128.v[0] < uOld)
5106 r128.v[1] += 1;
5107 }
5108# else
5109 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5110# endif
5111 return r128;
5112}
5113
5114
5115DECLINLINE(float128_t) iemFpuSoftF128PrecisionIprt(PCRTFLOAT128U pr128, unsigned cBits, uint16_t fFcw = X86_FCW_RC_NEAREST)
5116{
5117 RT_NOREF(fFcw);
5118 Assert(cBits > 64);
5119# if 0 /* rounding does not seem to help, not even on constants */
5120 float128_t r128 = { pr128->au64[0], pr128->au64[1] };
5121 uint64_t off = r128.v[0] & (RT_BIT_64(1 + 112 - cBits) - 1);
5122 r128.v[0] &= ~(RT_BIT_64(1 + 112 - cBits) - 1);
5123 if (off >= RT_BIT_64(1 + 112 - cBits - 1)
5124 && (r128.v[0] & RT_BIT_64(1 + 112 - cBits)))
5125 {
5126 uint64_t uOld = r128.v[0];
5127 r128.v[0] += RT_BIT_64(1 + 112 - cBits);
5128 if (r128.v[0] < uOld)
5129 r128.v[1] += 1;
5130 }
5131 return r128;
5132# else
5133 float128_t r128 = { { pr128->au64[0] & ~(RT_BIT_64(1 + 112 - cBits) - 1), pr128->au64[1] } };
5134 return r128;
5135# endif
5136}
5137
5138
# if 0 /* unused */
/**
 * Converts an IPRT 128-bit floating point value to the SoftFloat format.
 *
 * This is only a structure format conversion, the two 64-bit words are copied as is.
 */
DECLINLINE(float128_t) iemFpuSoftF128FromIprt(PCRTFLOAT128U pr128)
{
    float128_t r128Ret = { { pr128->au64[0], pr128->au64[1] } };
    return r128Ret;
}
# endif
5146
5147
5148/** Converts a 80-bit floating point value to SoftFloat 128-bit floating point. */
5149DECLINLINE(float128_t) iemFpuSoftF128FromFloat80(PCRTFLOAT80U pr80Val)
5150{
5151 extFloat80_t Tmp;
5152 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5153 Tmp.signif = pr80Val->s2.uMantissa;
5154 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
5155 return extF80_to_f128(Tmp, &Ignored);
5156}
5157
5158
5159/**
5160 * Converts from the packed IPRT 80-bit floating point (RTFLOAT80U) format to
5161 * the SoftFloat extended 80-bit floating point format (extFloat80_t).
5162 *
5163 * This is only a structure format conversion, nothing else.
5164 */
5165DECLINLINE(extFloat80_t) iemFpuSoftF80FromIprt(PCRTFLOAT80U pr80Val)
5166{
5167 extFloat80_t Tmp;
5168 Tmp.signExp = pr80Val->s2.uSignAndExponent;
5169 Tmp.signif = pr80Val->s2.uMantissa;
5170 return Tmp;
5171}
5172
5173
5174/**
5175 * Converts from SoftFloat extended 80-bit floating point format (extFloat80_t)
5176 * to the packed IPRT 80-bit floating point (RTFLOAT80U) format.
5177 *
5178 * This is only a structure format conversion, nothing else.
5179 */
5180DECLINLINE(PRTFLOAT80U) iemFpuSoftF80ToIprt(PRTFLOAT80U pr80Dst, extFloat80_t const r80XSrc)
5181{
5182 pr80Dst->s2.uSignAndExponent = r80XSrc.signExp;
5183 pr80Dst->s2.uMantissa = r80XSrc.signif;
5184 return pr80Dst;
5185}
5186
5187
5188DECLINLINE(uint16_t) iemFpuSoftF128ToFloat80(PRTFLOAT80U pr80Dst, float128_t r128Src, uint16_t fFcw, uint16_t fFsw)
5189{
5190 RT_NOREF(fFcw);
5191 RTFLOAT128U Tmp;
5192 *(float128_t *)&Tmp = r128Src;
5193 ASMCompilerBarrier();
5194
5195 if (RTFLOAT128U_IS_NORMAL(&Tmp))
5196 {
5197 pr80Dst->s.fSign = Tmp.s64.fSign;
5198 pr80Dst->s.uExponent = Tmp.s64.uExponent;
5199 uint64_t uFraction = Tmp.s64.uFractionHi << (63 - 48)
5200 | Tmp.s64.uFractionLo >> (64 - 15);
5201
5202 /* Do rounding - just truncate in near mode when midway on an even outcome. */
5203 unsigned const cShiftOff = 64 - 15;
5204 uint64_t const fRoundingOffMask = RT_BIT_64(cShiftOff) - 1;
5205 uint64_t const uRoundedOff = Tmp.s64.uFractionLo & fRoundingOffMask;
5206 if (uRoundedOff)
5207 {
5208 uint64_t const uRoundingAdd = (fFcw & X86_FCW_RC_MASK) == X86_FCW_RC_NEAREST
5209 ? RT_BIT_64(cShiftOff - 1)
5210 : (fFcw & X86_FCW_RC_MASK) == (Tmp.s64.fSign ? X86_FCW_RC_DOWN : X86_FCW_RC_UP)
5211 ? fRoundingOffMask
5212 : 0;
5213 if ( (fFcw & X86_FCW_RC_MASK) != X86_FCW_RC_NEAREST
5214 || (Tmp.s64.uFractionLo & RT_BIT_64(cShiftOff))
5215 || uRoundedOff != uRoundingAdd)
5216 {
5217 if ((uRoundedOff + uRoundingAdd) >> cShiftOff)
5218 {
5219 uFraction += 1;
5220 if (!(uFraction & RT_BIT_64(63)))
5221 { /* likely */ }
5222 else
5223 {
5224 uFraction >>= 1;
5225 pr80Dst->s.uExponent++;
5226 if (pr80Dst->s.uExponent == RTFLOAT64U_EXP_MAX)
5227 return fFsw;
5228 }
5229 fFsw |= X86_FSW_C1;
5230 }
5231 }
5232 fFsw |= X86_FSW_PE;
5233 if (!(fFcw & X86_FCW_PM))
5234 fFsw |= X86_FSW_ES | X86_FSW_B;
5235 }
5236
5237 pr80Dst->s.uMantissa = RT_BIT_64(63) | uFraction;
5238 }
5239 else if (RTFLOAT128U_IS_ZERO(&Tmp))
5240 {
5241 pr80Dst->s.fSign = Tmp.s64.fSign;
5242 pr80Dst->s.uExponent = 0;
5243 pr80Dst->s.uMantissa = 0;
5244 }
5245 else if (RTFLOAT128U_IS_INF(&Tmp))
5246 {
5247 pr80Dst->s.fSign = Tmp.s64.fSign;
5248 pr80Dst->s.uExponent = 0x7fff;
5249 pr80Dst->s.uMantissa = 0;
5250 }
5251 return fFsw;
5252}
5253
5254
5255/**
5256 * Helper for transfering exception and C1 to FSW and setting the result value
5257 * accordingly.
5258 *
5259 * @returns Updated FSW.
5260 * @param pSoftState The SoftFloat state following the operation.
5261 * @param r80XResult The result of the SoftFloat operation.
5262 * @param pr80Result Where to store the result for IEM.
5263 * @param fFcw The FPU control word.
5264 * @param fFsw The FSW before the operation, with necessary bits
5265 * cleared and such.
5266 * @param pr80XcptResult Alternative return value for use an unmasked \#IE is
5267 * raised.
5268 */
5269DECLINLINE(uint16_t) iemFpuSoftStateAndF80ToFswAndIprtResult(softfloat_state_t const *pSoftState, extFloat80_t r80XResult,
5270 PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw,
5271 PCRTFLOAT80U pr80XcptResult)
5272{
5273 fFsw |= (pSoftState->exceptionFlags & X86_FSW_XCPT_MASK)
5274 | (uint16_t)((pSoftState->exceptionFlags & softfloat_flag_c1) << 2);
5275 if (fFsw & ~fFcw & X86_FSW_XCPT_MASK)
5276 fFsw |= X86_FSW_ES | X86_FSW_B;
5277
5278 if (!(fFsw & ~fFcw & (X86_FSW_IE | X86_FSW_DE)))
5279 iemFpuSoftF80ToIprt(pr80Result, r80XResult);
5280 else
5281 {
5282 fFsw &= ~(X86_FSW_OE | X86_FSW_UE | X86_FSW_PE | X86_FSW_ZE | X86_FSW_C1);
5283 *pr80Result = *pr80XcptResult;
5284 }
5285 return fFsw;
5286}
5287
5288
5289/**
5290 * Helper doing polynomial evaluation using Horner's method.
5291 *
5292 * See https://en.wikipedia.org/wiki/Horner%27s_method for details.
5293 */
5294float128_t iemFpuSoftF128HornerPoly(float128_t z, PCRTFLOAT128U g_par128HornerConsts, size_t cHornerConsts,
5295 unsigned cPrecision, softfloat_state_t *pSoftState)
5296{
5297 Assert(cHornerConsts > 1);
5298 size_t i = cHornerConsts - 1;
5299 float128_t r128Result = iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision);
5300 while (i-- > 0)
5301 {
5302 r128Result = iemFpuSoftF128Precision(f128_mul(r128Result, z, pSoftState), cPrecision);
5303 r128Result = f128_add(r128Result, iemFpuSoftF128PrecisionIprt(&g_par128HornerConsts[i], cPrecision), pSoftState);
5304 r128Result = iemFpuSoftF128Precision(r128Result, cPrecision);
5305 }
5306 return r128Result;
5307}
5308
5309#endif /* !IEM_WITH_FLOAT128_FOR_FPU - SoftFloat */
5310
5311
5312/**
5313 * Composes a normalized and rounded RTFLOAT80U result from a 192 bit wide
5314 * mantissa, exponent and sign.
5315 *
5316 * @returns Updated FSW.
5317 * @param pr80Dst Where to return the composed value.
5318 * @param fSign The sign.
5319 * @param puMantissa The mantissa, 256-bit type but the to 64-bits are
5320 * ignored and should be zero. This will probably be
5321 * modified during normalization and rounding.
5322 * @param iExponent Unbiased exponent.
5323 * @param fFcw The FPU control word.
5324 * @param fFsw The FPU status word.
5325 */
5326static uint16_t iemFpuFloat80RoundAndComposeFrom192(PRTFLOAT80U pr80Dst, bool fSign, PRTUINT256U puMantissa,
5327 int32_t iExponent, uint16_t fFcw, uint16_t fFsw)
5328{
5329 AssertStmt(puMantissa->QWords.qw3 == 0, puMantissa->QWords.qw3 = 0);
5330
5331 iExponent += RTFLOAT80U_EXP_BIAS;
5332
5333 /* Do normalization if necessary and possible. */
5334 if (!(puMantissa->QWords.qw2 & RT_BIT_64(63)))
5335 {
5336 int cShift = 192 - RTUInt256BitCount(puMantissa);
5337 if (iExponent > cShift)
5338 iExponent -= cShift;
5339 else
5340 {
5341 if (fFcw & X86_FCW_UM)
5342 {
5343 if (iExponent > 0)
5344 cShift = --iExponent;
5345 else
5346 cShift = 0;
5347 }
5348 iExponent -= cShift;
5349 }
5350 RTUInt256AssignShiftLeft(puMantissa, cShift);
5351 }
5352
5353 /* Do rounding. */
5354 uint64_t uMantissa = puMantissa->QWords.qw2;
5355 if (puMantissa->QWords.qw1 || puMantissa->QWords.qw0)
5356 {
5357 bool fAdd;
5358 switch (fFcw & X86_FCW_RC_MASK)
5359 {
5360 default: /* (for the simple-minded MSC which otherwise things fAdd would be used uninitialized) */
5361 case X86_FCW_RC_NEAREST:
5362 if (puMantissa->QWords.qw1 & RT_BIT_64(63))
5363 {
5364 if ( (uMantissa & 1)
5365 || puMantissa->QWords.qw0 != 0
5366 || puMantissa->QWords.qw1 != RT_BIT_64(63))
5367 {
5368 fAdd = true;
5369 break;
5370 }
5371 uMantissa &= ~(uint64_t)1;
5372 }
5373 fAdd = false;
5374 break;
5375 case X86_FCW_RC_ZERO:
5376 fAdd = false;
5377 break;
5378 case X86_FCW_RC_UP:
5379 fAdd = !fSign;
5380 break;
5381 case X86_FCW_RC_DOWN:
5382 fAdd = fSign;
5383 break;
5384 }
5385 if (fAdd)
5386 {
5387 uint64_t const uTmp = uMantissa;
5388 uMantissa = uTmp + 1;
5389 if (uMantissa < uTmp)
5390 {
5391 uMantissa >>= 1;
5392 uMantissa |= RT_BIT_64(63);
5393 iExponent++;
5394 }
5395 fFsw |= X86_FSW_C1;
5396 }
5397 fFsw |= X86_FSW_PE;
5398 if (!(fFcw & X86_FCW_PM))
5399 fFsw |= X86_FSW_ES | X86_FSW_B;
5400 }
5401
5402 /* Check for underflow (denormals). */
5403 if (iExponent <= 0)
5404 {
5405 if (fFcw & X86_FCW_UM)
5406 {
5407 if (uMantissa & RT_BIT_64(63))
5408 uMantissa >>= 1;
5409 iExponent = 0;
5410 }
5411 else
5412 {
5413 iExponent += RTFLOAT80U_EXP_BIAS_ADJUST;
5414 fFsw |= X86_FSW_ES | X86_FSW_B;
5415 }
5416 fFsw |= X86_FSW_UE;
5417 }
5418 /* Check for overflow */
5419 else if (iExponent >= RTFLOAT80U_EXP_MAX)
5420 {
5421 Assert(iExponent < RTFLOAT80U_EXP_MAX);
5422 }
5423
5424 /* Compose the result. */
5425 pr80Dst->s.uMantissa = uMantissa;
5426 pr80Dst->s.uExponent = iExponent;
5427 pr80Dst->s.fSign = fSign;
5428 return fFsw;
5429}
5430
5431
5432/**
5433 * See also iemAImpl_fld_r80_from_r32
5434 */
5435static uint16_t iemAImplConvertR32ToR80(PCRTFLOAT32U pr32Val, PRTFLOAT80U pr80Dst)
5436{
5437 uint16_t fFsw = 0;
5438 if (RTFLOAT32U_IS_NORMAL(pr32Val))
5439 {
5440 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5441 pr80Dst->sj64.fInteger = 1;
5442 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5443 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5444 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5445 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5446 }
5447 else if (RTFLOAT32U_IS_ZERO(pr32Val))
5448 {
5449 pr80Dst->s.fSign = pr32Val->s.fSign;
5450 pr80Dst->s.uExponent = 0;
5451 pr80Dst->s.uMantissa = 0;
5452 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5453 }
5454 else if (RTFLOAT32U_IS_SUBNORMAL(pr32Val))
5455 {
5456 /* Subnormal -> normalized + X86_FSW_DE return. */
5457 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5458 pr80Dst->sj64.fInteger = 1;
5459 unsigned const cExtraShift = RTFLOAT32U_FRACTION_BITS - ASMBitLastSetU32(pr32Val->s.uFraction);
5460 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5461 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS + cExtraShift + 1);
5462 pr80Dst->sj64.uExponent = pr32Val->s.uExponent - RTFLOAT32U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5463 fFsw = X86_FSW_DE;
5464 }
5465 else if (RTFLOAT32U_IS_INF(pr32Val))
5466 {
5467 pr80Dst->s.fSign = pr32Val->s.fSign;
5468 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5469 pr80Dst->s.uMantissa = RT_BIT_64(63);
5470 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5471 }
5472 else
5473 {
5474 Assert(RTFLOAT32U_IS_NAN(pr32Val));
5475 pr80Dst->sj64.fSign = pr32Val->s.fSign;
5476 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5477 pr80Dst->sj64.fInteger = 1;
5478 pr80Dst->sj64.uFraction = (uint64_t)pr32Val->s.uFraction
5479 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT32U_FRACTION_BITS);
5480 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5481 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val));
5482 }
5483 return fFsw;
5484}
5485
5486
5487/**
5488 * See also iemAImpl_fld_r80_from_r64
5489 */
5490static uint16_t iemAImplConvertR64ToR80(PCRTFLOAT64U pr64Val, PRTFLOAT80U pr80Dst)
5491{
5492 uint16_t fFsw = 0;
5493 if (RTFLOAT64U_IS_NORMAL(pr64Val))
5494 {
5495 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5496 pr80Dst->sj64.fInteger = 1;
5497 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5498 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS;
5499 Assert(RTFLOAT80U_IS_NORMAL(pr80Dst));
5500 }
5501 else if (RTFLOAT64U_IS_ZERO(pr64Val))
5502 {
5503 pr80Dst->s.fSign = pr64Val->s.fSign;
5504 pr80Dst->s.uExponent = 0;
5505 pr80Dst->s.uMantissa = 0;
5506 Assert(RTFLOAT80U_IS_ZERO(pr80Dst));
5507 }
5508 else if (RTFLOAT64U_IS_SUBNORMAL(pr64Val))
5509 {
5510 /* Subnormal values gets normalized. */
5511 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5512 pr80Dst->sj64.fInteger = 1;
5513 unsigned const cExtraShift = RTFLOAT64U_FRACTION_BITS - ASMBitLastSetU64(pr64Val->s64.uFraction);
5514 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction
5515 << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS + cExtraShift + 1);
5516 pr80Dst->sj64.uExponent = pr64Val->s.uExponent - RTFLOAT64U_EXP_BIAS + RTFLOAT80U_EXP_BIAS - cExtraShift;
5517 fFsw = X86_FSW_DE;
5518 }
5519 else if (RTFLOAT64U_IS_INF(pr64Val))
5520 {
5521 pr80Dst->s.fSign = pr64Val->s.fSign;
5522 pr80Dst->s.uExponent = RTFLOAT80U_EXP_MAX;
5523 pr80Dst->s.uMantissa = RT_BIT_64(63);
5524 Assert(RTFLOAT80U_IS_INF(pr80Dst));
5525 }
5526 else
5527 {
5528 /* Signalling and quiet NaNs, both turn into quiet ones when loaded (weird). */
5529 Assert(RTFLOAT64U_IS_NAN(pr64Val));
5530 pr80Dst->sj64.fSign = pr64Val->s.fSign;
5531 pr80Dst->sj64.uExponent = RTFLOAT80U_EXP_MAX;
5532 pr80Dst->sj64.fInteger = 1;
5533 pr80Dst->sj64.uFraction = pr64Val->s64.uFraction << (RTFLOAT80U_FRACTION_BITS - RTFLOAT64U_FRACTION_BITS);
5534 Assert(RTFLOAT80U_IS_NAN(pr80Dst));
5535 Assert(RTFLOAT80U_IS_SIGNALLING_NAN(pr80Dst) == RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val));
5536 }
5537 return fFsw;
5538}
5539
5540
5541/**
5542 * See also EMIT_FILD.
5543 */
5544#define EMIT_CONVERT_IXX_TO_R80(a_cBits) \
5545static PRTFLOAT80U iemAImplConvertI ## a_cBits ## ToR80(int ## a_cBits ## _t iVal, PRTFLOAT80U pr80Dst) \
5546{ \
5547 if (iVal == 0) \
5548 { \
5549 pr80Dst->s.fSign = 0; \
5550 pr80Dst->s.uExponent = 0; \
5551 pr80Dst->s.uMantissa = 0; \
5552 } \
5553 else \
5554 { \
5555 if (iVal > 0) \
5556 pr80Dst->s.fSign = 0; \
5557 else \
5558 { \
5559 pr80Dst->s.fSign = 1; \
5560 iVal = -iVal; \
5561 } \
5562 unsigned const cBits = ASMBitLastSetU ## a_cBits((uint ## a_cBits ## _t)iVal); \
5563 pr80Dst->s.uExponent = cBits - 1 + RTFLOAT80U_EXP_BIAS; \
5564 pr80Dst->s.uMantissa = (uint64_t)iVal << (RTFLOAT80U_FRACTION_BITS + 1 - cBits); \
5565 } \
5566 return pr80Dst; \
5567}
5568EMIT_CONVERT_IXX_TO_R80(16)
5569EMIT_CONVERT_IXX_TO_R80(32)
5570//EMIT_CONVERT_IXX_TO_R80(64)
5571
/**
 * For implementing iemAImpl_fmul_r80_by_r64 and such.
 *
 * The emitted function converts the 64-bit operand to 80-bit and then calls
 * @a a_fnR80ByR80.  If the conversion reports a denormal operand, that is
 * either dropped (first operand invalid or NaN, or @a a_DenormalException
 * true), raised as an unmasked exception without calling the worker, or ORed
 * into the FSW produced by the worker.  The FSW TOP field is set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit function doing the work.
 * @param   a_DenormalException     Expression that, when true, suppresses the
 *                                  denormal indication.  It is evaluated inside
 *                                  the emitted function and may reference its
 *                                  parameters (e.g. pr80Val1).
 */
#define EMIT_R80_BY_R64(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            || RTFLOAT80U_IS_NAN(pr80Val1) \
            || (a_DenormalException)) \
            fFsw = 0; /* Something else takes precedence over the denormal operand. */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked denormal operand: return the first operand and raise the exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) \
                               | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5594
/**
 * For implementing iemAImpl_fmul_r80_by_r32 and such.
 *
 * The emitted function converts the 32-bit operand to 80-bit and then calls
 * @a a_fnR80ByR80.  If the conversion reports a denormal operand, that is
 * either dropped (first operand invalid or NaN, or @a a_DenormalException
 * true), raised as an unmasked exception without calling the worker, or ORed
 * into the FSW produced by the worker.  The FSW TOP field is set to 7.
 *
 * @param   a_Name                  Name of the function to emit.
 * @param   a_fnR80ByR80            The 80-bit by 80-bit function doing the work.
 * @param   a_DenormalException     Expression that, when true, suppresses the
 *                                  denormal indication.  It is evaluated inside
 *                                  the emitted function and may reference its
 *                                  parameters (e.g. pr80Val1).
 */
#define EMIT_R80_BY_R32(a_Name, a_fnR80ByR80, a_DenormalException) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2)) \
{ \
    RTFLOAT80U r80Val2; \
    uint16_t   fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2); \
    Assert(!fFsw || fFsw == X86_FSW_DE); \
    if (fFsw != 0) \
    { \
        if (   RTFLOAT80U_IS_387_INVALID(pr80Val1) \
            || RTFLOAT80U_IS_NAN(pr80Val1) \
            || (a_DenormalException)) \
            fFsw = 0; /* Something else takes precedence over the denormal operand. */ \
        else if ((pFpuState->FCW & X86_FCW_DM) == 0) \
        { \
            /* Unmasked denormal operand: return the first operand and raise the exception. */ \
            pFpuRes->r80Result = *pr80Val1; \
            pFpuRes->FSW       = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) \
                               | (7 << X86_FSW_TOP_SHIFT) \
                               | X86_FSW_DE | X86_FSW_ES | X86_FSW_B; \
            return; \
        } \
    } \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, &r80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT) | fFsw; \
}
5617
/**
 * For implementing iemAImpl_fimul_r80_by_i32 and such.
 *
 * The emitted function converts the 32-bit integer operand to 80-bit floating
 * point, calls @a a_fnR80ByR80 and finally sets the FSW TOP field to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit function doing the work.
 */
#define EMIT_R80_BY_I32(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI32ToR80(*pi32Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5626
/**
 * For implementing iemAImpl_fimul_r80_by_i16 and such.
 *
 * The emitted function converts the 16-bit integer operand to 80-bit floating
 * point, calls @a a_fnR80ByR80 and finally sets the FSW TOP field to 7.
 *
 * @param   a_Name          Name of the function to emit.
 * @param   a_fnR80ByR80    The 80-bit by 80-bit function doing the work.
 */
#define EMIT_R80_BY_I16(a_Name, a_fnR80ByR80) \
IEM_DECL_IMPL_DEF(void, a_Name,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2)) \
{ \
    RTFLOAT80U         r80Val2; \
    PCRTFLOAT80U const pr80Val2 = iemAImplConvertI16ToR80(*pi16Val2, &r80Val2); \
    a_fnR80ByR80(pFpuState, pFpuRes, pr80Val1, pr80Val2); \
    pFpuRes->FSW = (pFpuRes->FSW & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT); \
}
5635
5636
5637
5638/*********************************************************************************************************************************
5639* x87 FPU Division Operations *
5640*********************************************************************************************************************************/
5641
5642/** Worker for iemAImpl_fdiv_r80_by_r80 & iemAImpl_fdivr_r80_by_r80. */
5643static uint16_t iemAImpl_fdiv_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5644 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5645{
5646 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5647 {
5648 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5649 extFloat80_t r80XResult = extF80_div(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5650 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5651 }
5652 if (!RTFLOAT80U_IS_ZERO(pr80Val1))
5653 { /* Div by zero. */
5654 if (fFcw & X86_FCW_ZM)
5655 *pr80Result = g_ar80Infinity[pr80Val1->s.fSign != pr80Val2->s.fSign];
5656 else
5657 {
5658 *pr80Result = *pr80Val1Org;
5659 fFsw |= X86_FSW_ES | X86_FSW_B;
5660 }
5661 fFsw |= X86_FSW_ZE;
5662 }
5663 else
5664 { /* Invalid operand */
5665 if (fFcw & X86_FCW_IM)
5666 *pr80Result = g_r80Indefinite;
5667 else
5668 {
5669 *pr80Result = *pr80Val1Org;
5670 fFsw |= X86_FSW_ES | X86_FSW_B;
5671 }
5672 fFsw |= X86_FSW_IE;
5673 }
5674 return fFsw;
5675}
5676
5677
5678IEM_DECL_IMPL_DEF(void, iemAImpl_fdiv_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5679 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5680{
5681 uint16_t const fFcw = pFpuState->FCW;
5682 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5683
5684 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5685 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5686 {
5687 if (fFcw & X86_FCW_IM)
5688 pFpuRes->r80Result = g_r80Indefinite;
5689 else
5690 {
5691 pFpuRes->r80Result = *pr80Val1;
5692 fFsw |= X86_FSW_ES | X86_FSW_B;
5693 }
5694 fFsw |= X86_FSW_IE;
5695 }
5696 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5697 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5698 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5699 {
5700 if (fFcw & X86_FCW_DM)
5701 {
5702 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5703 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5704 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5705 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5706 }
5707 else
5708 {
5709 pFpuRes->r80Result = *pr80Val1;
5710 fFsw |= X86_FSW_ES | X86_FSW_B;
5711 }
5712 fFsw |= X86_FSW_DE;
5713 }
5714 /* SoftFloat can handle the rest: */
5715 else
5716 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5717
5718 pFpuRes->FSW = fFsw;
5719}
5720
5721
5722EMIT_R80_BY_R64(iemAImpl_fdiv_r80_by_r64, iemAImpl_fdiv_r80_by_r80, 0)
5723EMIT_R80_BY_R32(iemAImpl_fdiv_r80_by_r32, iemAImpl_fdiv_r80_by_r80, 0)
5724EMIT_R80_BY_I32(iemAImpl_fidiv_r80_by_i32, iemAImpl_fdiv_r80_by_r80)
5725EMIT_R80_BY_I16(iemAImpl_fidiv_r80_by_i16, iemAImpl_fdiv_r80_by_r80)
5726
5727
5728IEM_DECL_IMPL_DEF(void, iemAImpl_fdivr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5729 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5730{
5731 uint16_t const fFcw = pFpuState->FCW;
5732 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5733
5734 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5735 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5736 {
5737 if (fFcw & X86_FCW_IM)
5738 pFpuRes->r80Result = g_r80Indefinite;
5739 else
5740 {
5741 pFpuRes->r80Result = *pr80Val1;
5742 fFsw |= X86_FSW_ES | X86_FSW_B;
5743 }
5744 fFsw |= X86_FSW_IE;
5745 }
5746 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5747 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5748 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_ZERO(pr80Val1)) )
5749 {
5750 if (fFcw & X86_FCW_DM)
5751 {
5752 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5753 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5754 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5755 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5756 }
5757 else
5758 {
5759 pFpuRes->r80Result = *pr80Val1;
5760 fFsw |= X86_FSW_ES | X86_FSW_B;
5761 }
5762 fFsw |= X86_FSW_DE;
5763 }
5764 /* SoftFloat can handle the rest: */
5765 else
5766 fFsw = iemAImpl_fdiv_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5767
5768 pFpuRes->FSW = fFsw;
5769}
5770
5771
5772EMIT_R80_BY_R64(iemAImpl_fdivr_r80_by_r64, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5773EMIT_R80_BY_R32(iemAImpl_fdivr_r80_by_r32, iemAImpl_fdivr_r80_by_r80, RTFLOAT80U_IS_ZERO(pr80Val1))
5774EMIT_R80_BY_I32(iemAImpl_fidivr_r80_by_i32, iemAImpl_fdivr_r80_by_r80)
5775EMIT_R80_BY_I16(iemAImpl_fidivr_r80_by_i16, iemAImpl_fdivr_r80_by_r80)
5776
5777
5778/** Worker for iemAImpl_fprem_r80_by_r80 & iemAImpl_fprem1_r80_by_r80. */
5779static uint16_t iemAImpl_fprem_fprem1_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5780 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org, bool fLegacyInstr)
5781{
5782 if (!RTFLOAT80U_IS_ZERO(pr80Val2) || RTFLOAT80U_IS_NAN(pr80Val1) || RTFLOAT80U_IS_INF(pr80Val1))
5783 {
5784 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5785 uint16_t fCxFlags = 0;
5786 extFloat80_t r80XResult = extF80_partialRem(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2),
5787 fLegacyInstr ? softfloat_round_minMag : softfloat_round_near_even,
5788 &fCxFlags, &SoftState);
5789 Assert(!(fCxFlags & ~X86_FSW_C_MASK));
5790 fFsw = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5791 if ( !(fFsw & X86_FSW_IE)
5792 && !RTFLOAT80U_IS_NAN(pr80Result)
5793 && !RTFLOAT80U_IS_INDEFINITE(pr80Result))
5794 {
5795 fFsw &= ~(uint16_t)X86_FSW_C_MASK;
5796 fFsw |= fCxFlags & X86_FSW_C_MASK;
5797 }
5798 return fFsw;
5799 }
5800
5801 /* Invalid operand */
5802 if (fFcw & X86_FCW_IM)
5803 *pr80Result = g_r80Indefinite;
5804 else
5805 {
5806 *pr80Result = *pr80Val1Org;
5807 fFsw |= X86_FSW_ES | X86_FSW_B;
5808 }
5809 return fFsw | X86_FSW_IE;
5810}
5811
5812
5813static void iemAImpl_fprem_fprem1_r80_by_r80(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5814 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, bool fLegacyInstr)
5815{
5816 uint16_t const fFcw = pFpuState->FCW;
5817 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 /*| X86_FSW_C2*/ | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5818
5819 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals.
5820 In addition, we'd like to handle zero ST(1) now as SoftFloat returns Inf instead
5821 of Indefinite. (Note! There is no #Z like the footnotes to tables 3-31 and 3-32
5822 for the FPREM1 & FPREM1 instructions in the intel reference manual claims!) */
5823 if ( RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2)
5824 || (RTFLOAT80U_IS_ZERO(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INDEFINITE(pr80Val1)))
5825 {
5826 if (fFcw & X86_FCW_IM)
5827 pFpuRes->r80Result = g_r80Indefinite;
5828 else
5829 {
5830 pFpuRes->r80Result = *pr80Val1;
5831 fFsw |= X86_FSW_ES | X86_FSW_B;
5832 }
5833 fFsw |= X86_FSW_IE;
5834 }
5835 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs & /0 trumps denormals. */
5836 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2) && !RTFLOAT80U_IS_ZERO(pr80Val2))
5837 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1) && !RTFLOAT80U_IS_INF(pr80Val1)) )
5838 {
5839 if (fFcw & X86_FCW_DM)
5840 {
5841 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5842 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5843 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5844 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5845 pr80Val1Org, fLegacyInstr);
5846 }
5847 else
5848 {
5849 pFpuRes->r80Result = *pr80Val1;
5850 fFsw |= X86_FSW_ES | X86_FSW_B;
5851 }
5852 fFsw |= X86_FSW_DE;
5853 }
5854 /* SoftFloat can handle the rest: */
5855 else
5856 fFsw = iemAImpl_fprem_fprem1_r80_by_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw,
5857 pr80Val1, fLegacyInstr);
5858
5859 pFpuRes->FSW = fFsw;
5860}
5861
5862
5863IEM_DECL_IMPL_DEF(void, iemAImpl_fprem_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5864 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5865{
5866 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, true /*fLegacyInstr*/);
5867}
5868
5869
5870IEM_DECL_IMPL_DEF(void, iemAImpl_fprem1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5871 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5872{
5873 iemAImpl_fprem_fprem1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2, false /*fLegacyInstr*/);
5874}
5875
5876
5877/*********************************************************************************************************************************
5878* x87 FPU Multiplication Operations *
5879*********************************************************************************************************************************/
5880
5881/** Worker for iemAImpl_fmul_r80_by_r80. */
5882static uint16_t iemAImpl_fmul_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5883 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5884{
5885 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5886 extFloat80_t r80XResult = extF80_mul(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5887 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5888}
5889
5890
5891IEM_DECL_IMPL_DEF(void, iemAImpl_fmul_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5892 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5893{
5894 uint16_t const fFcw = pFpuState->FCW;
5895 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5896
5897 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5898 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5899 {
5900 if (fFcw & X86_FCW_IM)
5901 pFpuRes->r80Result = g_r80Indefinite;
5902 else
5903 {
5904 pFpuRes->r80Result = *pr80Val1;
5905 fFsw |= X86_FSW_ES | X86_FSW_B;
5906 }
5907 fFsw |= X86_FSW_IE;
5908 }
5909 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5910 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5911 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5912 {
5913 if (fFcw & X86_FCW_DM)
5914 {
5915 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5916 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5917 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5918 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5919 }
5920 else
5921 {
5922 pFpuRes->r80Result = *pr80Val1;
5923 fFsw |= X86_FSW_ES | X86_FSW_B;
5924 }
5925 fFsw |= X86_FSW_DE;
5926 }
5927 /* SoftFloat can handle the rest: */
5928 else
5929 fFsw = iemAImpl_fmul_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5930
5931 pFpuRes->FSW = fFsw;
5932}
5933
5934
5935EMIT_R80_BY_R64(iemAImpl_fmul_r80_by_r64, iemAImpl_fmul_r80_by_r80, 0)
5936EMIT_R80_BY_R32(iemAImpl_fmul_r80_by_r32, iemAImpl_fmul_r80_by_r80, 0)
5937EMIT_R80_BY_I32(iemAImpl_fimul_r80_by_i32, iemAImpl_fmul_r80_by_r80)
5938EMIT_R80_BY_I16(iemAImpl_fimul_r80_by_i16, iemAImpl_fmul_r80_by_r80)
5939
5940
5941/*********************************************************************************************************************************
5942* x87 FPU Addition *
5943*********************************************************************************************************************************/
5944
5945/** Worker for iemAImpl_fadd_r80_by_r80. */
5946static uint16_t iemAImpl_fadd_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
5947 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
5948{
5949 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
5950 extFloat80_t r80XResult = extF80_add(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
5951 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
5952}
5953
5954
5955IEM_DECL_IMPL_DEF(void, iemAImpl_fadd_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
5956 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
5957{
5958 uint16_t const fFcw = pFpuState->FCW;
5959 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
5960
5961 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
5962 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
5963 {
5964 if (fFcw & X86_FCW_IM)
5965 pFpuRes->r80Result = g_r80Indefinite;
5966 else
5967 {
5968 pFpuRes->r80Result = *pr80Val1;
5969 fFsw |= X86_FSW_ES | X86_FSW_B;
5970 }
5971 fFsw |= X86_FSW_IE;
5972 }
5973 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
5974 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
5975 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
5976 {
5977 if (fFcw & X86_FCW_DM)
5978 {
5979 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
5980 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
5981 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
5982 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
5983 }
5984 else
5985 {
5986 pFpuRes->r80Result = *pr80Val1;
5987 fFsw |= X86_FSW_ES | X86_FSW_B;
5988 }
5989 fFsw |= X86_FSW_DE;
5990 }
5991 /* SoftFloat can handle the rest: */
5992 else
5993 fFsw = iemAImpl_fadd_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
5994
5995 pFpuRes->FSW = fFsw;
5996}
5997
5998
5999EMIT_R80_BY_R64(iemAImpl_fadd_r80_by_r64, iemAImpl_fadd_r80_by_r80, 0)
6000EMIT_R80_BY_R32(iemAImpl_fadd_r80_by_r32, iemAImpl_fadd_r80_by_r80, 0)
6001EMIT_R80_BY_I32(iemAImpl_fiadd_r80_by_i32, iemAImpl_fadd_r80_by_r80)
6002EMIT_R80_BY_I16(iemAImpl_fiadd_r80_by_i16, iemAImpl_fadd_r80_by_r80)
6003
6004
6005/*********************************************************************************************************************************
6006* x87 FPU Subtraction *
6007*********************************************************************************************************************************/
6008
6009/** Worker for iemAImpl_fsub_r80_by_r80 and iemAImpl_fsubr_r80_by_r80. */
6010static uint16_t iemAImpl_fsub_f80_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result,
6011 uint16_t fFcw, uint16_t fFsw, PCRTFLOAT80U pr80Val1Org)
6012{
6013 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
6014 extFloat80_t r80XResult = extF80_sub(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
6015 return iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, pr80Result, fFcw, fFsw, pr80Val1Org);
6016}
6017
6018
6019IEM_DECL_IMPL_DEF(void, iemAImpl_fsub_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6020 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6021{
6022 uint16_t const fFcw = pFpuState->FCW;
6023 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6024
6025 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6026 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6027 {
6028 if (fFcw & X86_FCW_IM)
6029 pFpuRes->r80Result = g_r80Indefinite;
6030 else
6031 {
6032 pFpuRes->r80Result = *pr80Val1;
6033 fFsw |= X86_FSW_ES | X86_FSW_B;
6034 }
6035 fFsw |= X86_FSW_IE;
6036 }
6037 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6038 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6039 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6040 {
6041 if (fFcw & X86_FCW_DM)
6042 {
6043 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6044 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6045 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6046 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6047 }
6048 else
6049 {
6050 pFpuRes->r80Result = *pr80Val1;
6051 fFsw |= X86_FSW_ES | X86_FSW_B;
6052 }
6053 fFsw |= X86_FSW_DE;
6054 }
6055 /* SoftFloat can handle the rest: */
6056 else
6057 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6058
6059 pFpuRes->FSW = fFsw;
6060}
6061
6062
6063EMIT_R80_BY_R64(iemAImpl_fsub_r80_by_r64, iemAImpl_fsub_r80_by_r80, 0)
6064EMIT_R80_BY_R32(iemAImpl_fsub_r80_by_r32, iemAImpl_fsub_r80_by_r80, 0)
6065EMIT_R80_BY_I32(iemAImpl_fisub_r80_by_i32, iemAImpl_fsub_r80_by_r80)
6066EMIT_R80_BY_I16(iemAImpl_fisub_r80_by_i16, iemAImpl_fsub_r80_by_r80)
6067
6068
6069/* Same as iemAImpl_fsub_r80_by_r80, but with input operands switched. */
6070IEM_DECL_IMPL_DEF(void, iemAImpl_fsubr_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6071 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6072{
6073 uint16_t const fFcw = pFpuState->FCW;
6074 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6075
6076 /* SoftFloat does not check for Pseudo-Infinity, Pseudo-Nan and Unnormals. */
6077 if (RTFLOAT80U_IS_387_INVALID(pr80Val1) || RTFLOAT80U_IS_387_INVALID(pr80Val2))
6078 {
6079 if (fFcw & X86_FCW_IM)
6080 pFpuRes->r80Result = g_r80Indefinite;
6081 else
6082 {
6083 pFpuRes->r80Result = *pr80Val1;
6084 fFsw |= X86_FSW_ES | X86_FSW_B;
6085 }
6086 fFsw |= X86_FSW_IE;
6087 }
6088 /* SoftFloat does not check for denormals and certainly not report them to us. NaNs trumps denormals. */
6089 else if ( (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val1) && !RTFLOAT80U_IS_NAN(pr80Val2))
6090 || (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val2) && !RTFLOAT80U_IS_NAN(pr80Val1)) )
6091 {
6092 if (fFcw & X86_FCW_DM)
6093 {
6094 PCRTFLOAT80U const pr80Val1Org = pr80Val1;
6095 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val1, r80Val1Normalized);
6096 IEM_NORMALIZE_PSEUDO_DENORMAL(pr80Val2, r80Val2Normalized);
6097 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1Org);
6098 }
6099 else
6100 {
6101 pFpuRes->r80Result = *pr80Val1;
6102 fFsw |= X86_FSW_ES | X86_FSW_B;
6103 }
6104 fFsw |= X86_FSW_DE;
6105 }
6106 /* SoftFloat can handle the rest: */
6107 else
6108 fFsw = iemAImpl_fsub_f80_r80_worker(pr80Val2, pr80Val1, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
6109
6110 pFpuRes->FSW = fFsw;
6111}
6112
6113
6114EMIT_R80_BY_R64(iemAImpl_fsubr_r80_by_r64, iemAImpl_fsubr_r80_by_r80, 0)
6115EMIT_R80_BY_R32(iemAImpl_fsubr_r80_by_r32, iemAImpl_fsubr_r80_by_r80, 0)
6116EMIT_R80_BY_I32(iemAImpl_fisubr_r80_by_i32, iemAImpl_fsubr_r80_by_r80)
6117EMIT_R80_BY_I16(iemAImpl_fisubr_r80_by_i16, iemAImpl_fsubr_r80_by_r80)
6118
6119
6120/*********************************************************************************************************************************
6121* x87 FPU Trigometric Operations *
6122*********************************************************************************************************************************/
6123static uint16_t iemAImpl_fpatan_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PIEMFPURESULT pFpuRes, uint16_t fFcw, uint16_t fFsw)
6124{
6125 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6126 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
6127 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
6128 extFloat80_t v;
6129 (void)fFcw;
6130
6131 v = extF80_atan2(y, x, &SoftState);
6132
6133 iemFpuSoftF80ToIprt(&pFpuRes->r80Result, v);
6134 return fFsw;
6135}
6136
6137IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6138 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6139{
6140 uint16_t const fFcw = pFpuState->FCW;
6141 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
6142
6143 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2))
6144 {
6145 fFsw = iemAImpl_fpatan_r80_by_r80_normal(pr80Val1, pr80Val2, pFpuRes, fFcw, fFsw);
6146
6147 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
6148 if (!(fFcw & X86_FCW_PM))
6149 fFsw |= X86_FSW_ES | X86_FSW_B;
6150 }
6151 else
6152 {
6153 fFsw |= X86_FSW_IE;
6154 if (!(fFcw & X86_FCW_IM))
6155 {
6156 pFpuRes->r80Result = *pr80Val2;
6157 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
6158 }
6159 else
6160 {
6161 pFpuRes->r80Result = g_r80Indefinite;
6162 fFsw |= (7 << X86_FSW_TOP_SHIFT);
6163 }
6164 }
6165
6166 pFpuRes->FSW = fFsw;
6167}
6168#endif /* IEM_WITHOUT_ASSEMBLY */
6169
6170IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6171 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6172{
6173 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6174}
6175
6176IEM_DECL_IMPL_DEF(void, iemAImpl_fpatan_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
6177 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6178{
6179 iemAImpl_fpatan_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
6180}
6181
6182
6183#if defined(IEM_WITHOUT_ASSEMBLY)
6184static uint16_t iemAImpl_fptan_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6185{
6186 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6187 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6188 extFloat80_t v;
6189 (void)fFcw;
6190
6191 v = extF80_tan(x, &SoftState);
6192
6193 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, v);
6194 return fFsw;
6195}
6196
6197IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6198{
6199 uint16_t const fFcw = pFpuState->FCW;
6200 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
6201
6202 if (RTFLOAT80U_IS_ZERO(pr80Val))
6203 {
6204 pFpuResTwo->r80Result1 = *pr80Val;
6205 pFpuResTwo->r80Result2 = g_ar80One[0];
6206 }
6207 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6208 {
6209 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6210 {
6211 fFsw |= X86_FSW_C2 | (7 << X86_FSW_TOP_SHIFT);
6212 pFpuResTwo->r80Result1 = *pr80Val;
6213 }
6214 else
6215 {
6216 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6217 {
6218 pFpuResTwo->r80Result1 = *pr80Val;
6219 }
6220 else
6221 {
6222 fFsw = iemAImpl_fptan_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6223 }
6224
6225 pFpuResTwo->r80Result2 = g_ar80One[0];
6226
6227 fFsw |= X86_FSW_PE;
6228 if (!(fFcw & X86_FCW_PM))
6229 fFsw |= X86_FSW_ES | X86_FSW_B;
6230 }
6231 }
6232 else
6233 {
6234 fFsw |= X86_FSW_IE;
6235 if (!(fFcw & X86_FCW_IM))
6236 fFsw |= X86_FSW_ES | X86_FSW_B | (7 << X86_FSW_TOP_SHIFT);
6237 }
6238
6239 pFpuResTwo->FSW = fFsw;
6240}
6241#endif /* IEM_WITHOUT_ASSEMBLY */
6242
6243IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6244{
6245 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6246}
6247
6248IEM_DECL_IMPL_DEF(void, iemAImpl_fptan_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6249{
6250 iemAImpl_fptan_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6251}
6252
6253#ifdef IEM_WITHOUT_ASSEMBLY
6254
6255static uint16_t iemAImpl_fsin_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6256{
6257 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6258 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6259 extFloat80_t v;
6260 (void)fFcw;
6261
6262 v = extF80_sin(x, &SoftState);
6263
6264 iemFpuSoftF80ToIprt(pr80Result, v);
6265
6266 return fFsw;
6267}
6268
6269IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6270{
6271 uint16_t const fFcw = pFpuState->FCW;
6272 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6273
6274 if (RTFLOAT80U_IS_ZERO(pr80Val))
6275 {
6276 pFpuRes->r80Result = *pr80Val;
6277 }
6278 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6279 {
6280 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6281 {
6282 fFsw |= X86_FSW_C2;
6283 pFpuRes->r80Result = *pr80Val;
6284 }
6285 else
6286 {
6287 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6288 {
6289 pFpuRes->r80Result = *pr80Val;
6290 }
6291 else
6292 {
6293 fFsw = iemAImpl_fsin_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6294 }
6295 fFsw |= X86_FSW_PE;
6296 if (!(fFcw & X86_FCW_PM))
6297 fFsw |= X86_FSW_ES | X86_FSW_B;
6298 }
6299 }
6300 else if (RTFLOAT80U_IS_INF(pr80Val))
6301 {
6302 fFsw |= X86_FSW_IE;
6303 if (!(fFcw & X86_FCW_IM))
6304 {
6305 fFsw |= X86_FSW_ES | X86_FSW_B;
6306 pFpuRes->r80Result = *pr80Val;
6307 }
6308 else
6309 {
6310 pFpuRes->r80Result = g_r80Indefinite;
6311 }
6312 }
6313 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6314 {
6315 fFsw |= X86_FSW_DE;
6316
6317 if (fFcw & X86_FCW_DM)
6318 {
6319 if (fFcw & X86_FCW_UM)
6320 {
6321 pFpuRes->r80Result = *pr80Val;
6322 }
6323 else
6324 {
6325 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6326 uint64_t uMantissa = pr80Val->s.uMantissa;
6327 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6328
6329 uExponent = 64 - uExponent;
6330 uMantissa <<= uExponent;
6331 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6332
6333 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
6334 pFpuRes->r80Result.s.uMantissa = uMantissa;
6335 pFpuRes->r80Result.s.uExponent = uExponent;
6336 }
6337
6338 fFsw |= X86_FSW_UE | X86_FSW_PE;
6339
6340 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6341 {
6342 /* All the exceptions are masked. */
6343 }
6344 else
6345 {
6346 fFsw |= X86_FSW_ES | X86_FSW_B;
6347 }
6348 }
6349 else
6350 {
6351 pFpuRes->r80Result = *pr80Val;
6352
6353 fFsw |= X86_FSW_ES | X86_FSW_B;
6354 }
6355 }
6356 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6357 {
6358 pFpuRes->r80Result = *pr80Val;
6359 fFsw |= X86_FSW_DE;
6360
6361 if (fFcw & X86_FCW_DM)
6362 {
6363 if (fFcw & X86_FCW_PM)
6364 {
6365 fFsw |= X86_FSW_PE;
6366 }
6367 else
6368 {
6369 fFsw |= X86_FSW_ES | X86_FSW_B | X86_FSW_PE;
6370 }
6371
6372 pFpuRes->r80Result.sj64.uExponent = 1;
6373 }
6374 else
6375 {
6376 fFsw |= X86_FSW_ES | X86_FSW_B;
6377 }
6378 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6379 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6380 {
6381 pFpuRes->r80Result = *pr80Val;
6382 } else {
6383 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6384 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6385 && (fFcw & X86_FCW_IM))
6386 pFpuRes->r80Result = g_r80Indefinite;
6387 else
6388 {
6389 pFpuRes->r80Result = *pr80Val;
6390 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6391 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6392 }
6393
6394 fFsw |= X86_FSW_IE;
6395 if (!(fFcw & X86_FCW_IM))
6396 fFsw |= X86_FSW_ES | X86_FSW_B;
6397 }
6398
6399 pFpuRes->FSW = fFsw;
6400}
6401#endif /* IEM_WITHOUT_ASSEMBLY */
6402
6403IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6404{
6405 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6406}
6407
6408IEM_DECL_IMPL_DEF(void, iemAImpl_fsin_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6409{
6410 iemAImpl_fsin_r80(pFpuState, pFpuRes, pr80Val);
6411}
6412
6413#ifdef IEM_WITHOUT_ASSEMBLY
6414
6415static uint16_t iemAImpl_fcos_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
6416{
6417 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6418 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6419 extFloat80_t v;
6420 (void)fFcw;
6421
6422 v = extF80_cos(x, &SoftState);
6423
6424 iemFpuSoftF80ToIprt(pr80Result, v);
6425
6426 return fFsw;
6427}
6428
6429IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6430{
6431 uint16_t const fFcw = pFpuState->FCW;
6432 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6433
6434 if (RTFLOAT80U_IS_ZERO(pr80Val))
6435 {
6436 pFpuRes->r80Result = g_ar80One[0];
6437 }
6438 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6439 {
6440 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6441 {
6442 fFsw |= X86_FSW_C2;
6443 pFpuRes->r80Result = *pr80Val;
6444 }
6445 else
6446 {
6447 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6448 {
6449 pFpuRes->r80Result = g_ar80One[0];
6450
6451 }
6452 else
6453 {
6454 fFsw = iemAImpl_fcos_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
6455 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6456 }
6457 fFsw |= X86_FSW_PE;
6458 if (!(fFcw & X86_FCW_PM))
6459 fFsw |= X86_FSW_ES | X86_FSW_B;
6460 }
6461 }
6462 else if (RTFLOAT80U_IS_INF(pr80Val))
6463 {
6464 fFsw |= X86_FSW_IE;
6465 if (!(fFcw & X86_FCW_IM))
6466 {
6467 fFsw |= X86_FSW_ES | X86_FSW_B;
6468 pFpuRes->r80Result = *pr80Val;
6469 }
6470 else
6471 {
6472 pFpuRes->r80Result = g_r80Indefinite;
6473 }
6474 }
6475 else if (RTFLOAT80U_IS_DENORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6476 {
6477 fFsw |= X86_FSW_DE;
6478
6479 if (fFcw & X86_FCW_DM)
6480 {
6481 pFpuRes->r80Result = g_ar80One[0];
6482
6483 if (fFcw & X86_FCW_PM)
6484 {
6485 fFsw |= X86_FSW_PE;
6486 }
6487 else
6488 {
6489 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6490 }
6491 }
6492 else
6493 {
6494 pFpuRes->r80Result = *pr80Val;
6495 fFsw |= X86_FSW_ES | X86_FSW_B;
6496 }
6497 } else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
6498 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6499 {
6500 pFpuRes->r80Result = *pr80Val;
6501 } else {
6502 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
6503 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6504 && (fFcw & X86_FCW_IM))
6505 pFpuRes->r80Result = g_r80Indefinite;
6506 else
6507 {
6508 pFpuRes->r80Result = *pr80Val;
6509 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
6510 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6511 }
6512
6513 fFsw |= X86_FSW_IE;
6514 if (!(fFcw & X86_FCW_IM))
6515 fFsw |= X86_FSW_ES | X86_FSW_B;
6516 }
6517
6518 pFpuRes->FSW = fFsw;
6519}
6520#endif /* IEM_WITHOUT_ASSEMBLY */
6521
6522IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6523{
6524 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6525}
6526
6527IEM_DECL_IMPL_DEF(void, iemAImpl_fcos_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
6528{
6529 iemAImpl_fcos_r80(pFpuState, pFpuRes, pr80Val);
6530}
6531
6532#ifdef IEM_WITHOUT_ASSEMBLY
6533
6534static uint16_t iemAImpl_fsincos_r80_r80_normal(PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val, uint16_t fFcw, uint16_t fFsw)
6535{
6536 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
6537 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val);
6538 extFloat80_t r80Sin, r80Cos;
6539 (void)fFcw;
6540
6541 extF80_sincos(x, &r80Sin, &r80Cos, &SoftState);
6542
6543 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, r80Sin);
6544 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result2, r80Cos);
6545
6546 return fFsw;
6547}
6548
6549IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6550{
6551 uint16_t const fFcw = pFpuState->FCW;
6552 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | /*X86_FSW_C2 |*/ X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
6553
6554 if (RTFLOAT80U_IS_ZERO(pr80Val))
6555 {
6556 pFpuResTwo->r80Result1 = *pr80Val;
6557 pFpuResTwo->r80Result2 = g_ar80One[0];
6558 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6559 }
6560 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6561 {
6562 if (pr80Val->s.uExponent >= RTFLOAT80U_EXP_BIAS + 63)
6563 {
6564 fFsw |= X86_FSW_C2;
6565
6566 if (fFcw & X86_FCW_IM)
6567 {
6568 pFpuResTwo->r80Result1 = g_r80Indefinite;
6569 }
6570 else
6571 {
6572 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6573 }
6574
6575 pFpuResTwo->r80Result2 = *pr80Val;
6576 }
6577 else
6578 {
6579 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6580
6581 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 63)
6582 {
6583 pFpuResTwo->r80Result1 = *pr80Val;
6584 pFpuResTwo->r80Result2 = g_ar80One[0];
6585 }
6586 else
6587 {
6588 fFsw = iemAImpl_fsincos_r80_r80_normal(pFpuResTwo, pr80Val, fFcw, fFsw);
6589 fFsw |= X86_FSW_C1; // TBD: If the inexact result was rounded up (C1 is set) or “not rounded up” (C1 is cleared).
6590 }
6591 fFsw |= X86_FSW_PE;
6592 if (!(fFcw & X86_FCW_PM))
6593 fFsw |= X86_FSW_ES | X86_FSW_B;
6594 }
6595 }
6596 else if (RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val))
6597 {
6598 fFsw |= X86_FSW_DE;
6599
6600 if (fFcw & X86_FCW_DM)
6601 {
6602 pFpuResTwo->r80Result1 = *pr80Val;
6603 pFpuResTwo->r80Result2 = g_ar80One[0];
6604 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6605
6606 if (fFcw & X86_FCW_PM)
6607 {
6608 fFsw |= X86_FSW_PE;
6609 }
6610 else
6611 {
6612 fFsw |= X86_FSW_PE | X86_FSW_ES | X86_FSW_B;
6613 }
6614
6615 pFpuResTwo->r80Result1.sj64.uExponent = 1;
6616 }
6617 else
6618 {
6619 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6620 pFpuResTwo->r80Result2 = *pr80Val;
6621 fFsw |= X86_FSW_ES | X86_FSW_B;
6622 }
6623 }
6624 else if (RTFLOAT80U_IS_DENORMAL(pr80Val))
6625 {
6626 fFsw |= X86_FSW_DE;
6627
6628 if (fFcw & X86_FCW_DM)
6629 {
6630 pFpuResTwo->r80Result2 = g_ar80One[0];
6631
6632 if (fFcw & X86_FCW_UM)
6633 {
6634 pFpuResTwo->r80Result1 = *pr80Val;
6635 }
6636 else
6637 {
6638 /* Underflow signalling as described at 7.4 section of 1985 IEEE 754*/
6639 uint64_t uMantissa = pr80Val->s.uMantissa;
6640 uint32_t uExponent = ASMBitLastSetU64(uMantissa);
6641
6642 uExponent = 64 - uExponent;
6643 uMantissa <<= uExponent;
6644 uExponent = RTFLOAT128U_EXP_BIAS_ADJUST - uExponent + 1;
6645
6646 pFpuResTwo->r80Result1.s.fSign = pr80Val->s.fSign;
6647 pFpuResTwo->r80Result1.s.uMantissa = uMantissa;
6648 pFpuResTwo->r80Result1.s.uExponent = uExponent;
6649 }
6650
6651 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6652 fFsw |= X86_FSW_UE | X86_FSW_PE;
6653
6654 if ((fFcw & X86_FCW_UM) && (fFcw & X86_FCW_PM))
6655 {
6656 /* All the exceptions are masked. */
6657 }
6658 else
6659 {
6660 fFsw |= X86_FSW_ES | X86_FSW_B;
6661 }
6662 }
6663 else
6664 {
6665 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6666 pFpuResTwo->r80Result2 = *pr80Val;
6667 fFsw |= X86_FSW_ES | X86_FSW_B;
6668 }
6669 }
6670 else if (RTFLOAT80U_IS_QUIET_NAN(pr80Val) || RTFLOAT80U_IS_INDEFINITE(pr80Val))
6671 {
6672 pFpuResTwo->r80Result1 = *pr80Val;
6673 pFpuResTwo->r80Result2 = *pr80Val;
6674 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6675 }
6676 else if (RTFLOAT80U_IS_UNNORMAL(pr80Val) || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
6677 {
6678 if (fFcw & X86_FCW_IM)
6679 {
6680 pFpuResTwo->r80Result1 = g_r80Indefinite;
6681 pFpuResTwo->r80Result2 = g_r80Indefinite;
6682 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6683 }
6684 else
6685 {
6686 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6687 pFpuResTwo->r80Result2 = *pr80Val;
6688 }
6689
6690 fFsw |= X86_FSW_IE;
6691 if (!(fFcw & X86_FCW_IM))
6692 fFsw |= X86_FSW_ES | X86_FSW_B;
6693 }
6694 else if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
6695 {
6696 pFpuResTwo->r80Result1 = *pr80Val;
6697 pFpuResTwo->r80Result2 = *pr80Val;
6698
6699 if (fFcw & X86_FCW_IM)
6700 {
6701 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
6702 pFpuResTwo->r80Result2.s.uMantissa |= RT_BIT_64(62);
6703 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6704 }
6705 else
6706 {
6707 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6708 pFpuResTwo->r80Result2 = *pr80Val;
6709 }
6710
6711 fFsw |= X86_FSW_IE;
6712 if (!(fFcw & X86_FCW_IM))
6713 fFsw |= X86_FSW_ES | X86_FSW_B;
6714 }
6715 else if (RTFLOAT80U_IS_INF(pr80Val))
6716 {
6717 if (fFcw & X86_FCW_IM)
6718 {
6719 pFpuResTwo->r80Result1 = g_r80Indefinite;
6720 pFpuResTwo->r80Result2 = g_r80Indefinite;
6721 fFsw &= ~X86_FSW_TOP_MASK | (6 << X86_FSW_TOP_SHIFT);
6722 }
6723 else
6724 {
6725 pFpuResTwo->r80Result1 = g_ar80Zero[0];
6726 pFpuResTwo->r80Result2 = *pr80Val;
6727 }
6728
6729 fFsw |= X86_FSW_IE;
6730 if (!(fFcw & X86_FCW_IM))
6731 fFsw |= X86_FSW_ES | X86_FSW_B;
6732 }
6733
6734 pFpuResTwo->FSW = fFsw;
6735}
6736#endif /* IEM_WITHOUT_ASSEMBLY */
6737
6738IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6739{
6740 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6741}
6742
6743IEM_DECL_IMPL_DEF(void, iemAImpl_fsincos_r80_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
6744{
6745 iemAImpl_fsincos_r80_r80(pFpuState, pFpuResTwo, pr80Val);
6746}
6747
6748#ifdef IEM_WITHOUT_ASSEMBLY
6749
6750
6751/*********************************************************************************************************************************
6752* x87 FPU Compare and Testing Operations *
6753*********************************************************************************************************************************/
6754
6755IEM_DECL_IMPL_DEF(void, iemAImpl_ftst_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6756{
6757 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6758
6759 if (RTFLOAT80U_IS_ZERO(pr80Val))
6760 fFsw |= X86_FSW_C3;
6761 else if (RTFLOAT80U_IS_NORMAL(pr80Val) || RTFLOAT80U_IS_INF(pr80Val))
6762 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 : 0;
6763 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6764 {
6765 fFsw |= pr80Val->s.fSign ? X86_FSW_C0 | X86_FSW_DE : X86_FSW_DE;
6766 if (!(pFpuState->FCW & X86_FCW_DM))
6767 fFsw |= X86_FSW_ES | X86_FSW_B;
6768 }
6769 else
6770 {
6771 fFsw |= X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6772 if (!(pFpuState->FCW & X86_FCW_IM))
6773 fFsw |= X86_FSW_ES | X86_FSW_B;
6774 }
6775
6776 *pu16Fsw = fFsw;
6777}
6778
6779
6780IEM_DECL_IMPL_DEF(void, iemAImpl_fxam_r80,(PCX86FXSTATE pFpuState, uint16_t *pu16Fsw, PCRTFLOAT80U pr80Val))
6781{
6782 RT_NOREF(pFpuState);
6783 uint16_t fFsw = (7 << X86_FSW_TOP_SHIFT);
6784
6785 /* C1 = sign bit (always, even if empty Intel says). */
6786 if (pr80Val->s.fSign)
6787 fFsw |= X86_FSW_C1;
6788
6789 /* Classify the value in C0, C2, C3. */
6790 if (!(pFpuState->FTW & RT_BIT_32(X86_FSW_TOP_GET(pFpuState->FSW))))
6791 fFsw |= X86_FSW_C0 | X86_FSW_C3; /* empty */
6792 else if (RTFLOAT80U_IS_NORMAL(pr80Val))
6793 fFsw |= X86_FSW_C2;
6794 else if (RTFLOAT80U_IS_ZERO(pr80Val))
6795 fFsw |= X86_FSW_C3;
6796 else if (RTFLOAT80U_IS_QUIET_OR_SIGNALLING_NAN(pr80Val))
6797 fFsw |= X86_FSW_C0;
6798 else if (RTFLOAT80U_IS_INF(pr80Val))
6799 fFsw |= X86_FSW_C0 | X86_FSW_C2;
6800 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
6801 fFsw |= X86_FSW_C2 | X86_FSW_C3;
6802 /* whatever else: 0 */
6803
6804 *pu16Fsw = fFsw;
6805}
6806
6807
6808/**
6809 * Worker for fcom, fucom, and friends.
6810 */
6811static uint16_t iemAImpl_fcom_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6812 uint16_t fFcw, uint16_t fFsw, bool fIeOnAllNaNs)
6813{
6814 /*
6815 * Unpack the values.
6816 */
6817 bool const fSign1 = pr80Val1->s.fSign;
6818 int32_t iExponent1 = pr80Val1->s.uExponent;
6819 uint64_t uMantissa1 = pr80Val1->s.uMantissa;
6820
6821 bool const fSign2 = pr80Val2->s.fSign;
6822 int32_t iExponent2 = pr80Val2->s.uExponent;
6823 uint64_t uMantissa2 = pr80Val2->s.uMantissa;
6824
6825 /*
6826 * Check for invalid inputs.
6827 */
6828 if ( RTFLOAT80U_IS_387_INVALID_EX(uMantissa1, iExponent1)
6829 || RTFLOAT80U_IS_387_INVALID_EX(uMantissa2, iExponent2))
6830 {
6831 if (!(fFcw & X86_FCW_IM))
6832 fFsw |= X86_FSW_ES | X86_FSW_B;
6833 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3 | X86_FSW_IE;
6834 }
6835
6836 /*
6837 * Check for NaNs and indefinites, they are all unordered and trumps #DE.
6838 */
6839 if ( RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6840 || RTFLOAT80U_IS_INDEFINITE_OR_QUIET_OR_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6841 {
6842 if ( fIeOnAllNaNs
6843 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa1, iExponent1)
6844 || RTFLOAT80U_IS_SIGNALLING_NAN_EX(uMantissa2, iExponent2))
6845 {
6846 fFsw |= X86_FSW_IE;
6847 if (!(fFcw & X86_FCW_IM))
6848 fFsw |= X86_FSW_ES | X86_FSW_B;
6849 }
6850 return fFsw | X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3;
6851 }
6852
6853 /*
6854 * Normalize the values.
6855 */
6856 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6857 {
6858 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa1, iExponent1))
6859 iExponent1 = 1;
6860 else
6861 {
6862 iExponent1 = 64 - ASMBitLastSetU64(uMantissa1);
6863 uMantissa1 <<= iExponent1;
6864 iExponent1 = 1 - iExponent1;
6865 }
6866 fFsw |= X86_FSW_DE;
6867 if (!(fFcw & X86_FCW_DM))
6868 fFsw |= X86_FSW_ES | X86_FSW_B;
6869 }
6870
6871 if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6872 {
6873 if (RTFLOAT80U_IS_PSEUDO_DENORMAL_EX(uMantissa2, iExponent2))
6874 iExponent2 = 1;
6875 else
6876 {
6877 iExponent2 = 64 - ASMBitLastSetU64(uMantissa2);
6878 uMantissa2 <<= iExponent2;
6879 iExponent2 = 1 - iExponent2;
6880 }
6881 fFsw |= X86_FSW_DE;
6882 if (!(fFcw & X86_FCW_DM))
6883 fFsw |= X86_FSW_ES | X86_FSW_B;
6884 }
6885
6886 /*
6887 * Test if equal (val1 == val2):
6888 */
6889 if ( uMantissa1 == uMantissa2
6890 && iExponent1 == iExponent2
6891 && ( fSign1 == fSign2
6892 || (uMantissa1 == 0 && iExponent1 == 0) /* ignore sign for zero */ ) )
6893 fFsw |= X86_FSW_C3;
6894 /*
6895 * Test if less than (val1 < val2):
6896 */
6897 else if (fSign1 && !fSign2)
6898 fFsw |= X86_FSW_C0;
6899 else if (fSign1 == fSign2)
6900 {
6901 /* Zeros are problematic, however at the most one can be zero here. */
6902 if (RTFLOAT80U_IS_ZERO_EX(uMantissa1, iExponent1))
6903 return !fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6904 if (RTFLOAT80U_IS_ZERO_EX(uMantissa2, iExponent2))
6905 return fSign1 ? fFsw | X86_FSW_C0 : fFsw;
6906
6907 if ( fSign1
6908 ^ ( iExponent1 < iExponent2
6909 || ( iExponent1 == iExponent2
6910 && uMantissa1 < uMantissa2 ) ) )
6911 fFsw |= X86_FSW_C0;
6912 }
6913 /* else: No flags set if greater. */
6914
6915 return fFsw;
6916}
6917
6918
6919IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6920 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6921{
6922 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6923}
6924
6925
6926
6927
6928IEM_DECL_IMPL_DEF(void, iemAImpl_fucom_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6929 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
6930{
6931 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, 6 << X86_FSW_TOP_SHIFT, false /*fIeOnAllNaNs*/);
6932}
6933
6934
6935IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r64,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6936 PCRTFLOAT80U pr80Val1, PCRTFLOAT64U pr64Val2))
6937{
6938 RTFLOAT80U r80Val2;
6939 uint16_t fFsw = iemAImplConvertR64ToR80(pr64Val2, &r80Val2);
6940 Assert(!fFsw || fFsw == X86_FSW_DE);
6941 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6942 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6943 {
6944 if (!(pFpuState->FCW & X86_FCW_DM))
6945 fFsw |= X86_FSW_ES | X86_FSW_B;
6946 *pfFsw |= fFsw;
6947 }
6948}
6949
6950
6951IEM_DECL_IMPL_DEF(void, iemAImpl_fcom_r80_by_r32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6952 PCRTFLOAT80U pr80Val1, PCRTFLOAT32U pr32Val2))
6953{
6954 RTFLOAT80U r80Val2;
6955 uint16_t fFsw = iemAImplConvertR32ToR80(pr32Val2, &r80Val2);
6956 Assert(!fFsw || fFsw == X86_FSW_DE);
6957 *pfFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, &r80Val2, pFpuState->FCW, 7 << X86_FSW_TOP_SHIFT, true /*fIeOnAllNaNs*/);
6958 if (fFsw != 0 && !(*pfFsw & X86_FSW_IE))
6959 {
6960 if (!(pFpuState->FCW & X86_FCW_DM))
6961 fFsw |= X86_FSW_ES | X86_FSW_B;
6962 *pfFsw |= fFsw;
6963 }
6964}
6965
6966
6967IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i32,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6968 PCRTFLOAT80U pr80Val1, int32_t const *pi32Val2))
6969{
6970 RTFLOAT80U r80Val2;
6971 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI32ToR80(*pi32Val2, &r80Val2));
6972 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6973}
6974
6975
6976IEM_DECL_IMPL_DEF(void, iemAImpl_ficom_r80_by_i16,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
6977 PCRTFLOAT80U pr80Val1, int16_t const *pi16Val2))
6978{
6979 RTFLOAT80U r80Val2;
6980 iemAImpl_fcom_r80_by_r80(pFpuState, pfFsw, pr80Val1, iemAImplConvertI16ToR80(*pi16Val2, &r80Val2));
6981 *pfFsw = (*pfFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
6982}
6983
6984
6985/**
6986 * Worker for fcomi & fucomi.
6987 */
6988static uint32_t iemAImpl_fcomi_r80_by_r80_worker(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2,
6989 uint16_t fFcw, uint16_t fFswIn, bool fIeOnAllNaNs, uint16_t *pfFsw)
6990{
6991 uint16_t fFsw = iemAImpl_fcom_r80_by_r80_worker(pr80Val1, pr80Val2, fFcw, 6 << X86_FSW_TOP_SHIFT, fIeOnAllNaNs);
6992 uint32_t fEflags = ((fFsw & X86_FSW_C3) >> (X86_FSW_C3_BIT - X86_EFL_ZF_BIT))
6993 | ((fFsw & X86_FSW_C2) >> (X86_FSW_C2_BIT - X86_EFL_PF_BIT))
6994 | ((fFsw & X86_FSW_C0) >> (X86_FSW_C0_BIT - X86_EFL_CF_BIT));
6995
6996 /* Note! C1 is not cleared as per docs! Everything is preserved. */
6997 *pfFsw = (fFsw & ~X86_FSW_C_MASK) | (fFswIn & X86_FSW_C_MASK);
6998 return fEflags | X86_EFL_IF | X86_EFL_RA1_MASK;
6999}
7000
7001
7002IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fcomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7003 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7004{
7005 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, true /*fIeOnAllNaNs*/, pfFsw);
7006}
7007
7008
7009IEM_DECL_IMPL_DEF(uint32_t, iemAImpl_fucomi_r80_by_r80,(PCX86FXSTATE pFpuState, uint16_t *pfFsw,
7010 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7011{
7012 return iemAImpl_fcomi_r80_by_r80_worker(pr80Val1, pr80Val2, pFpuState->FCW, pFpuState->FSW, false /*fIeOnAllNaNs*/, pfFsw);
7013}
7014
7015
7016/*********************************************************************************************************************************
7017* x87 FPU Other Operations *
7018*********************************************************************************************************************************/
7019
7020/**
7021 * Helper for iemAImpl_frndint_r80, called both on normal and denormal numbers.
7022 */
7023static uint16_t iemAImpl_frndint_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7024{
7025 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7026 iemFpuSoftF80ToIprt(pr80Result, extF80_roundToInt(iemFpuSoftF80FromIprt(pr80Val), SoftState.roundingMode,
7027 true /*exact / generate #PE */, &SoftState));
7028 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7029}
7030
7031
7032IEM_DECL_IMPL_DEF(void, iemAImpl_frndint_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7033{
7034 uint16_t const fFcw = pFpuState->FCW;
7035 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7036
7037 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7038 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7039 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7040 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7041 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7042 || RTFLOAT80U_IS_INF(pr80Val))
7043 pFpuRes->r80Result = *pr80Val;
7044 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7045 {
7046 fFsw |= X86_FSW_DE;
7047 if (fFcw & X86_FCW_DM)
7048 fFsw = iemAImpl_frndint_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7049 else
7050 {
7051 pFpuRes->r80Result = *pr80Val;
7052 fFsw |= X86_FSW_ES | X86_FSW_B;
7053 }
7054 }
7055 else
7056 {
7057 if (fFcw & X86_FCW_IM)
7058 {
7059 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7060 pFpuRes->r80Result = g_r80Indefinite;
7061 else
7062 {
7063 pFpuRes->r80Result = *pr80Val;
7064 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7065 }
7066 }
7067 else
7068 {
7069 pFpuRes->r80Result = *pr80Val;
7070 fFsw |= X86_FSW_ES | X86_FSW_B;
7071 }
7072 fFsw |= X86_FSW_IE;
7073 }
7074 pFpuRes->FSW = fFsw;
7075}
7076
7077
7078IEM_DECL_IMPL_DEF(void, iemAImpl_fscale_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7079 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7080{
7081 /* The SoftFloat worker function extF80_scale_extF80 is of our creation, so
7082 it does everything we need it to do. */
7083 uint16_t const fFcw = pFpuState->FCW;
7084 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7085 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7086 extFloat80_t r80XResult = extF80_scale_extF80(iemFpuSoftF80FromIprt(pr80Val1), iemFpuSoftF80FromIprt(pr80Val2), &SoftState);
7087 pFpuRes->FSW = iemFpuSoftStateAndF80ToFswAndIprtResult(&SoftState, r80XResult, &pFpuRes->r80Result, fFcw, fFsw, pr80Val1);
7088}
7089
7090
7091/**
7092 * Helper for iemAImpl_fsqrt_r80, called both on normal and denormal numbers.
7093 */
7094static uint16_t iemAImpl_fsqrt_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7095{
7096 Assert(!pr80Val->s.fSign);
7097 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_FCW(fFcw);
7098 iemFpuSoftF80ToIprt(pr80Result, extF80_sqrt(iemFpuSoftF80FromIprt(pr80Val), &SoftState));
7099 return IEM_SOFTFLOAT_STATE_TO_FSW(fFsw, &SoftState, fFcw);
7100}
7101
7102
7103IEM_DECL_IMPL_DEF(void, iemAImpl_fsqrt_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7104{
7105 uint16_t const fFcw = pFpuState->FCW;
7106 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7107
7108 if (RTFLOAT80U_IS_NORMAL(pr80Val) && !pr80Val->s.fSign)
7109 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7110 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7111 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7112 || RTFLOAT80U_IS_INDEFINITE(pr80Val)
7113 || (RTFLOAT80U_IS_INF(pr80Val) && !pr80Val->s.fSign))
7114 pFpuRes->r80Result = *pr80Val;
7115 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val) && !pr80Val->s.fSign) /* Negative denormals only generate #IE! */
7116 {
7117 fFsw |= X86_FSW_DE;
7118 if (fFcw & X86_FCW_DM)
7119 fFsw = iemAImpl_fsqrt_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7120 else
7121 {
7122 pFpuRes->r80Result = *pr80Val;
7123 fFsw |= X86_FSW_ES | X86_FSW_B;
7124 }
7125 }
7126 else
7127 {
7128 if (fFcw & X86_FCW_IM)
7129 {
7130 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7131 pFpuRes->r80Result = g_r80Indefinite;
7132 else
7133 {
7134 pFpuRes->r80Result = *pr80Val;
7135 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7136 }
7137 }
7138 else
7139 {
7140 pFpuRes->r80Result = *pr80Val;
7141 fFsw |= X86_FSW_ES | X86_FSW_B;
7142 }
7143 fFsw |= X86_FSW_IE;
7144 }
7145 pFpuRes->FSW = fFsw;
7146}
7147
7148
7149/**
7150 * @code{.unparsed}
7151 * x x * ln2
7152 * f(x) = 2 - 1 = e - 1
7153 *
7154 * @endcode
7155 *
7156 * We can approximate e^x by a Taylor/Maclaurin series (see
7157 * https://en.wikipedia.org/wiki/Taylor_series#Exponential_function):
7158 * @code{.unparsed}
7159 * n 0 1 2 3 4
7160 * inf x x x x x x
7161 * SUM ----- = --- + --- + --- + --- + --- + ...
7162 * n=0 n! 0! 1! 2! 3! 4!
7163 *
7164 * 2 3 4
7165 * x x x
7166 * = 1 + x + --- + --- + --- + ...
7167 * 2! 3! 4!
7168 * @endcode
7169 *
7170 * Given z = x * ln2, we get:
7171 * @code{.unparsed}
7172 * 2 3 4 n
7173 * z z z z z
7174 * e - 1 = z + --- + --- + --- + ... + ---
7175 * 2! 3! 4! n!
7176 * @endcode
7177 *
7178 * Wanting to use Horner's method, we move one z outside and get:
7179 * @code{.unparsed}
7180 * 2 3 (n-1)
7181 * z z z z
7182 * = z ( 1 + --- + --- + --- + ... + ------- )
7183 * 2! 3! 4! n!
7184 * @endcode
7185 *
7186 * The constants we need for using Horner's methods are 1 and 1 / n!.
7187 *
7188 * For very tiny x values, we can get away with f(x) = x * ln 2, because
7189 * because we don't have the necessary precision to represent 1.0 + z/3 + ...
7190 * and can approximate it to be 1.0. For a visual demonstration of this
7191 * check out https://www.desmos.com/calculator/vidcdxizd9 (for as long
7192 * as it valid), plotting f(x) = 2^x - 1 and f(x) = x * ln2.
7193 *
7194 *
7195 * As constant accuracy goes, figure 0.1 "80387 Block Diagram" in the "80387
7196 * Data Sheet" (order 231920-002; Appendix E in 80387 PRM 231917-001; Military
7197 * i387SX 271166-002), indicates that constants are 67-bit (constant rom block)
7198 * and the internal mantissa size is 68-bit (mantissa adder & barrel shifter
7199 * blocks). (The one bit difference is probably an implicit one missing from
7200 * the constant ROM.) A paper on division and sqrt on the AMD-K7 by Stuart F.
7201 * Oberman states that it internally used a 68 bit mantissa with a 18-bit
7202 * exponent.
7203 *
7204 * However, even when sticking to 67 constants / 68 mantissas, I have not yet
7205 * successfully reproduced the exact results from an Intel 10980XE, there is
7206 * always a portition of rounding differences. Not going to spend too much time
7207 * on getting this 100% the same, at least not now.
7208 *
7209 * P.S. If someone are really curious about 8087 and its contstants:
7210 * http://www.righto.com/2020/05/extracting-rom-constants-from-8087-math.html
7211 *
7212 *
7213 * @param pr80Val The exponent value (x), less than 1.0, greater than
7214 * -1.0 and not zero. This can be a normal, denormal
7215 * or pseudo-denormal value.
7216 * @param pr80Result Where to return the result.
7217 * @param fFcw FPU control word.
7218 * @param fFsw FPU status word.
7219 */
7220static uint16_t iemAImpl_f2xm1_r80_normal(PCRTFLOAT80U pr80Val, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7221{
7222 /* As mentioned above, we can skip the expensive polynomial calculation
7223 as it will be close enough to 1.0 that it makes no difference.
7224
7225 The cutoff point for intel 10980XE is exponents >= -69. Intel
7226 also seems to be using a 67-bit or 68-bit constant value, and we get
7227 a smattering of rounding differences if we go for higher precision. */
7228 if (pr80Val->s.uExponent <= RTFLOAT80U_EXP_BIAS - 69)
7229 {
7230 RTUINT256U u256;
7231 RTUInt128MulByU64Ex(&u256, &g_u128Ln2MantissaIntel, pr80Val->s.uMantissa);
7232 u256.QWords.qw0 |= 1; /* force #PE */
7233 fFsw = iemFpuFloat80RoundAndComposeFrom192(pr80Result, pr80Val->s.fSign, &u256,
7234 !RTFLOAT80U_IS_PSEUDO_DENORMAL(pr80Val) && !RTFLOAT80U_IS_DENORMAL(pr80Val)
7235 ? (int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS
7236 : 1 - RTFLOAT80U_EXP_BIAS,
7237 fFcw, fFsw);
7238 }
7239 else
7240 {
7241#ifdef IEM_WITH_FLOAT128_FOR_FPU
7242 /* This approach is not good enough for small values, we end up with zero. */
7243 int const fOldRounding = iemFpuF128SetRounding(fFcw);
7244 _Float128 rd128Val = iemFpuF128FromFloat80(pr80Val, fFcw);
7245 _Float128 rd128Result = powf128(2.0L, rd128Val);
7246 rd128Result -= 1.0L;
7247 fFsw = iemFpuF128ToFloat80(pr80Result, rd128Result, fFcw, fFsw);
7248 iemFpuF128RestoreRounding(fOldRounding);
7249
7250# else
7251 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7252 float128_t const x = iemFpuSoftF128FromFloat80(pr80Val);
7253
7254 /* As mentioned above, enforce 68-bit internal mantissa width to better
7255 match the Intel 10980XE results. */
7256 unsigned const cPrecision = 68;
7257
7258 /* first calculate z = x * ln2 */
7259 float128_t z = iemFpuSoftF128Precision(f128_mul(x, iemFpuSoftF128PrecisionIprt(&g_r128Ln2, cPrecision), &SoftState),
7260 cPrecision);
7261
7262 /* Then do the polynomial evaluation. */
7263 float128_t r = iemFpuSoftF128HornerPoly(z, g_ar128F2xm1HornerConsts, RT_ELEMENTS(g_ar128F2xm1HornerConsts),
7264 cPrecision, &SoftState);
7265 r = f128_mul(z, r, &SoftState);
7266
7267 /* Output the result. */
7268 fFsw = iemFpuSoftF128ToFloat80(pr80Result, r, fFcw, fFsw);
7269# endif
7270 }
7271 return fFsw;
7272}
7273
7274
7275IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7276{
7277 uint16_t const fFcw = pFpuState->FCW;
7278 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7279
7280 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7281 {
7282 if (pr80Val->s.uExponent < RTFLOAT80U_EXP_BIAS)
7283 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7284 else
7285 {
7286 /* Special case:
7287 2^+1.0 - 1.0 = 1.0
7288 2^-1.0 - 1.0 = -0.5 */
7289 if ( pr80Val->s.uExponent == RTFLOAT80U_EXP_BIAS
7290 && pr80Val->s.uMantissa == RT_BIT_64(63))
7291 {
7292 pFpuRes->r80Result.s.uMantissa = RT_BIT_64(63);
7293 pFpuRes->r80Result.s.uExponent = RTFLOAT80U_EXP_BIAS - pr80Val->s.fSign;
7294 pFpuRes->r80Result.s.fSign = pr80Val->s.fSign;
7295 }
7296 /* ST(0) > 1.0 || ST(0) < -1.0: undefined behavior */
7297 /** @todo 287 is documented to only accept values 0 <= ST(0) <= 0.5. */
7298 else
7299 pFpuRes->r80Result = *pr80Val;
7300 fFsw |= X86_FSW_PE;
7301 if (!(fFcw & X86_FCW_PM))
7302 fFsw |= X86_FSW_ES | X86_FSW_B;
7303 }
7304 }
7305 else if ( RTFLOAT80U_IS_ZERO(pr80Val)
7306 || RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7307 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7308 pFpuRes->r80Result = *pr80Val;
7309 else if (RTFLOAT80U_IS_INF(pr80Val))
7310 pFpuRes->r80Result = pr80Val->s.fSign ? g_ar80One[1] : *pr80Val;
7311 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7312 {
7313 fFsw |= X86_FSW_DE;
7314 if (fFcw & X86_FCW_DM)
7315 fFsw = iemAImpl_f2xm1_r80_normal(pr80Val, &pFpuRes->r80Result, fFcw, fFsw);
7316 else
7317 {
7318 pFpuRes->r80Result = *pr80Val;
7319 fFsw |= X86_FSW_ES | X86_FSW_B;
7320 }
7321 }
7322 else
7323 {
7324 if ( ( RTFLOAT80U_IS_UNNORMAL(pr80Val)
7325 || RTFLOAT80U_IS_PSEUDO_NAN(pr80Val))
7326 && (fFcw & X86_FCW_IM))
7327 pFpuRes->r80Result = g_r80Indefinite;
7328 else
7329 {
7330 pFpuRes->r80Result = *pr80Val;
7331 if (RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val) && (fFcw & X86_FCW_IM))
7332 pFpuRes->r80Result.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7333 }
7334 fFsw |= X86_FSW_IE;
7335 if (!(fFcw & X86_FCW_IM))
7336 fFsw |= X86_FSW_ES | X86_FSW_B;
7337 }
7338 pFpuRes->FSW = fFsw;
7339}
7340
7341#endif /* IEM_WITHOUT_ASSEMBLY */
7342
7343IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7344{
7345 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7346}
7347
7348IEM_DECL_IMPL_DEF(void, iemAImpl_f2xm1_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7349{
7350 iemAImpl_f2xm1_r80(pFpuState, pFpuRes, pr80Val);
7351}
7352
7353#ifdef IEM_WITHOUT_ASSEMBLY
7354
7355IEM_DECL_IMPL_DEF(void, iemAImpl_fabs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7356{
7357 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7358 pFpuRes->r80Result = *pr80Val;
7359 pFpuRes->r80Result.s.fSign = 0;
7360}
7361
7362
7363IEM_DECL_IMPL_DEF(void, iemAImpl_fchs_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes, PCRTFLOAT80U pr80Val))
7364{
7365 pFpuRes->FSW = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (7 << X86_FSW_TOP_SHIFT);
7366 pFpuRes->r80Result = *pr80Val;
7367 pFpuRes->r80Result.s.fSign = !pr80Val->s.fSign;
7368}
7369
7370
7371IEM_DECL_IMPL_DEF(void, iemAImpl_fxtract_r80_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULTTWO pFpuResTwo, PCRTFLOAT80U pr80Val))
7372{
7373 uint16_t const fFcw = pFpuState->FCW;
7374 uint16_t fFsw = (pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3)) | (6 << X86_FSW_TOP_SHIFT);
7375
7376 if (RTFLOAT80U_IS_NORMAL(pr80Val))
7377 {
7378 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7379 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80((int32_t)pr80Val->s.uExponent - RTFLOAT80U_EXP_BIAS, &Ignored));
7380
7381 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7382 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7383 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7384 }
7385 else if (RTFLOAT80U_IS_ZERO(pr80Val))
7386 {
7387 fFsw |= X86_FSW_ZE;
7388 if (fFcw & X86_FCW_ZM)
7389 {
7390 pFpuResTwo->r80Result1 = g_ar80Infinity[1];
7391 pFpuResTwo->r80Result2 = *pr80Val;
7392 }
7393 else
7394 {
7395 pFpuResTwo->r80Result2 = *pr80Val;
7396 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7397 }
7398 }
7399 else if (RTFLOAT80U_IS_DENORMAL_OR_PSEUDO_DENORMAL(pr80Val))
7400 {
7401 fFsw |= X86_FSW_DE;
7402 if (fFcw & X86_FCW_DM)
7403 {
7404 pFpuResTwo->r80Result2.s.fSign = pr80Val->s.fSign;
7405 pFpuResTwo->r80Result2.s.uExponent = RTFLOAT80U_EXP_BIAS;
7406 pFpuResTwo->r80Result2.s.uMantissa = pr80Val->s.uMantissa;
7407 int32_t iExponent = -16382;
7408 while (!(pFpuResTwo->r80Result2.s.uMantissa & RT_BIT_64(63)))
7409 {
7410 pFpuResTwo->r80Result2.s.uMantissa <<= 1;
7411 iExponent--;
7412 }
7413
7414 softfloat_state_t Ignored = SOFTFLOAT_STATE_INIT_DEFAULTS();
7415 iemFpuSoftF80ToIprt(&pFpuResTwo->r80Result1, i32_to_extF80(iExponent, &Ignored));
7416 }
7417 else
7418 {
7419 pFpuResTwo->r80Result2 = *pr80Val;
7420 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7421 }
7422 }
7423 else if ( RTFLOAT80U_IS_QUIET_NAN(pr80Val)
7424 || RTFLOAT80U_IS_INDEFINITE(pr80Val))
7425 {
7426 pFpuResTwo->r80Result1 = *pr80Val;
7427 pFpuResTwo->r80Result2 = *pr80Val;
7428 }
7429 else if (RTFLOAT80U_IS_INF(pr80Val))
7430 {
7431 pFpuResTwo->r80Result1 = g_ar80Infinity[0];
7432 pFpuResTwo->r80Result2 = *pr80Val;
7433 }
7434 else
7435 {
7436 if (fFcw & X86_FCW_IM)
7437 {
7438 if (!RTFLOAT80U_IS_SIGNALLING_NAN(pr80Val))
7439 pFpuResTwo->r80Result1 = g_r80Indefinite;
7440 else
7441 {
7442 pFpuResTwo->r80Result1 = *pr80Val;
7443 pFpuResTwo->r80Result1.s.uMantissa |= RT_BIT_64(62); /* make it quiet */
7444 }
7445 pFpuResTwo->r80Result2 = pFpuResTwo->r80Result1;
7446 }
7447 else
7448 {
7449 pFpuResTwo->r80Result2 = *pr80Val;
7450 fFsw = X86_FSW_ES | X86_FSW_B | (fFsw & ~X86_FSW_TOP_MASK) | (7 << X86_FSW_TOP_SHIFT);
7451 }
7452 fFsw |= X86_FSW_IE;
7453 }
7454 pFpuResTwo->FSW = fFsw;
7455}
7456#endif /* IEM_WITHOUT_ASSEMBLY */
7457
7458#if defined(IEM_WITHOUT_ASSEMBLY)
7459
7460static uint16_t iemAImpl_fyl2x_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7461{
7462 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7463 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7464 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7465 extFloat80_t v;
7466 (void)fFcw;
7467
7468 v = extF80_ylog2x(y, x, &SoftState);
7469 iemFpuSoftF80ToIprt(pr80Result, v);
7470
7471 return fFsw;
7472}
7473
7474IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7475 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7476{
7477 uint16_t const fFcw = pFpuState->FCW;
7478 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7479
7480 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && !pr80Val2->s.fSign)
7481 {
7482 fFsw |= iemAImpl_fyl2x_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7483
7484 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7485 if (!(fFcw & X86_FCW_PM))
7486 fFsw |= X86_FSW_ES | X86_FSW_B;
7487 }
7488 else
7489 {
7490 fFsw |= X86_FSW_IE;
7491
7492 if (!(fFcw & X86_FCW_IM))
7493 {
7494 pFpuRes->r80Result = *pr80Val2;
7495 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7496 }
7497 else
7498 {
7499 pFpuRes->r80Result = g_r80Indefinite;
7500 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7501 }
7502 }
7503
7504 pFpuRes->FSW = fFsw;
7505}
7506#endif /* IEM_WITHOUT_ASSEMBLY */
7507
7508IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7509 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7510{
7511 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7512}
7513
7514IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2x_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7515 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7516{
7517 iemAImpl_fyl2x_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7518}
7519
7520#if defined(IEM_WITHOUT_ASSEMBLY)
7521
7522static uint16_t iemAImpl_fyl2xp1_r80_by_r80_normal(PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2, PRTFLOAT80U pr80Result, uint16_t fFcw, uint16_t fFsw)
7523{
7524 softfloat_state_t SoftState = SOFTFLOAT_STATE_INIT_DEFAULTS();
7525 extFloat80_t y = iemFpuSoftF80FromIprt(pr80Val1);
7526 extFloat80_t x = iemFpuSoftF80FromIprt(pr80Val2);
7527 extFloat80_t v;
7528 (void)fFcw;
7529
7530 v = extF80_ylog2xp1(y, x, &SoftState);
7531 iemFpuSoftF80ToIprt(pr80Result, v);
7532
7533 return fFsw;
7534}
7535
7536IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7537 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7538{
7539 uint16_t const fFcw = pFpuState->FCW;
7540 uint16_t fFsw = pFpuState->FSW & (X86_FSW_C0 | X86_FSW_C2 | X86_FSW_C3);
7541
7542 if (RTFLOAT80U_IS_NORMAL(pr80Val1) && RTFLOAT80U_IS_NORMAL(pr80Val2) && pr80Val2->s.uExponent < RTFLOAT80U_EXP_BIAS)
7543 {
7544 fFsw = iemAImpl_fyl2xp1_r80_by_r80_normal(pr80Val1, pr80Val2, &pFpuRes->r80Result, fFcw, fFsw);
7545
7546 fFsw |= X86_FSW_PE | (7 << X86_FSW_TOP_SHIFT);
7547 if (!(fFcw & X86_FCW_PM))
7548 fFsw |= X86_FSW_ES | X86_FSW_B;
7549 }
7550 else
7551 {
7552 fFsw |= X86_FSW_IE;
7553
7554 if (!(fFcw & X86_FCW_IM))
7555 {
7556 pFpuRes->r80Result = *pr80Val2;
7557 fFsw |= X86_FSW_ES | X86_FSW_B | (6 << X86_FSW_TOP_SHIFT);
7558 }
7559 else
7560 {
7561 pFpuRes->r80Result = g_r80Indefinite;
7562 fFsw |= (7 << X86_FSW_TOP_SHIFT);
7563 }
7564 }
7565
7566 pFpuRes->FSW = fFsw;
7567}
7568
7569#endif /* IEM_WITHOUT_ASSEMBLY */
7570
7571IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_intel,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7572 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7573{
7574 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7575}
7576
7577IEM_DECL_IMPL_DEF(void, iemAImpl_fyl2xp1_r80_by_r80_amd,(PCX86FXSTATE pFpuState, PIEMFPURESULT pFpuRes,
7578 PCRTFLOAT80U pr80Val1, PCRTFLOAT80U pr80Val2))
7579{
7580 iemAImpl_fyl2xp1_r80_by_r80(pFpuState, pFpuRes, pr80Val1, pr80Val2);
7581}
7582
7583
7584/*********************************************************************************************************************************
7585* MMX, SSE & AVX *
7586*********************************************************************************************************************************/
7587
7588#ifdef IEM_WITH_VEX
7589
7590/*
7591 * VMOVSLDUP
7592 */
7593IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7594{
7595 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[0];
7596 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[0];
7597 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[2];
7598 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[2];
7599 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7600 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[0];
7601 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7602 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[2];
7603}
7604
7605
7606IEM_DECL_IMPL_DEF(void, iemAImpl_vmovsldup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7607{
7608 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[0];
7609 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[0];
7610 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[2];
7611 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[2];
7612 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[4];
7613 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[4];
7614 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[6];
7615 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[6];
7616}
7617
7618#endif /* IEM_WITH_VEX */
7619
7620
7621#ifdef IEM_WITH_VEX
7622
7623/*
7624 * VMOVSHDUP
7625 */
7626IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7627{
7628 pXState->x87.aXMM[iYRegDst].au32[0] = pXState->x87.aXMM[iYRegSrc].au32[1];
7629 pXState->x87.aXMM[iYRegDst].au32[1] = pXState->x87.aXMM[iYRegSrc].au32[1];
7630 pXState->x87.aXMM[iYRegDst].au32[2] = pXState->x87.aXMM[iYRegSrc].au32[3];
7631 pXState->x87.aXMM[iYRegDst].au32[3] = pXState->x87.aXMM[iYRegSrc].au32[3];
7632 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7633 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[1];
7634 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7635 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au32[3];
7636}
7637
7638
7639IEM_DECL_IMPL_DEF(void, iemAImpl_vmovshdup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7640{
7641 pXState->x87.aXMM[iYRegDst].au32[0] = pSrc->au32[1];
7642 pXState->x87.aXMM[iYRegDst].au32[1] = pSrc->au32[1];
7643 pXState->x87.aXMM[iYRegDst].au32[2] = pSrc->au32[3];
7644 pXState->x87.aXMM[iYRegDst].au32[3] = pSrc->au32[3];
7645 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[0] = pSrc->au32[5];
7646 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[1] = pSrc->au32[5];
7647 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[2] = pSrc->au32[7];
7648 pXState->u.YmmHi.aYmmHi[iYRegDst].au32[3] = pSrc->au32[7];
7649}
7650
7651#endif /* IEM_WITH_VEX */
7652
7653
7654#ifdef IEM_WITH_VEX
7655
7656/*
7657 * VMOVDDUP
7658 */
7659IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rr,(PX86XSAVEAREA pXState, uint8_t iYRegDst, uint8_t iYRegSrc))
7660{
7661 pXState->x87.aXMM[iYRegDst].au64[0] = pXState->x87.aXMM[iYRegSrc].au64[0];
7662 pXState->x87.aXMM[iYRegDst].au64[1] = pXState->x87.aXMM[iYRegSrc].au64[0];
7663 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7664 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pXState->u.YmmHi.aYmmHi[iYRegSrc].au64[0];
7665}
7666
7667IEM_DECL_IMPL_DEF(void, iemAImpl_vmovddup_256_rm,(PX86XSAVEAREA pXState, uint8_t iYRegDst, PCRTUINT256U pSrc))
7668{
7669 pXState->x87.aXMM[iYRegDst].au64[0] = pSrc->au64[0];
7670 pXState->x87.aXMM[iYRegDst].au64[1] = pSrc->au64[0];
7671 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[0] = pSrc->au64[2];
7672 pXState->u.YmmHi.aYmmHi[iYRegDst].au64[1] = pSrc->au64[2];
7673}
7674
7675#endif /* IEM_WITH_VEX */
7676
7677
7678/*
7679 * PAND / VPAND / PANDPS / VPANDPS / PANDPD / VPANDPD
7680 */
7681#ifdef IEM_WITHOUT_ASSEMBLY
7682
7683IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7684{
7685 RT_NOREF(pFpuState);
7686 *puDst &= *puSrc;
7687}
7688
7689
7690IEM_DECL_IMPL_DEF(void, iemAImpl_pand_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7691{
7692 RT_NOREF(pFpuState);
7693 puDst->au64[0] &= puSrc->au64[0];
7694 puDst->au64[1] &= puSrc->au64[1];
7695}
7696
7697#endif
7698
7699IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7700 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7701{
7702 RT_NOREF(pExtState);
7703 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7704 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7705}
7706
7707
7708IEM_DECL_IMPL_DEF(void, iemAImpl_vpand_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7709 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7710{
7711 RT_NOREF(pExtState);
7712 puDst->au64[0] = puSrc1->au64[0] & puSrc2->au64[0];
7713 puDst->au64[1] = puSrc1->au64[1] & puSrc2->au64[1];
7714 puDst->au64[2] = puSrc1->au64[2] & puSrc2->au64[2];
7715 puDst->au64[3] = puSrc1->au64[3] & puSrc2->au64[3];
7716}
7717
7718
7719/*
7720 * PANDN / VPANDN / PANDNPS / VPANDNPS / PANDNPD / VPANDNPD
7721 */
7722#ifdef IEM_WITHOUT_ASSEMBLY
7723
7724IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7725{
7726 RT_NOREF(pFpuState);
7727 *puDst = ~*puDst & *puSrc;
7728}
7729
7730
7731IEM_DECL_IMPL_DEF(void, iemAImpl_pandn_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7732{
7733 RT_NOREF(pFpuState);
7734 puDst->au64[0] = ~puDst->au64[0] & puSrc->au64[0];
7735 puDst->au64[1] = ~puDst->au64[1] & puSrc->au64[1];
7736}
7737
7738#endif
7739
7740IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7741 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7742{
7743 RT_NOREF(pExtState);
7744 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7745 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7746}
7747
7748
7749IEM_DECL_IMPL_DEF(void, iemAImpl_vpandn_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7750 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7751{
7752 RT_NOREF(pExtState);
7753 puDst->au64[0] = ~puSrc1->au64[0] & puSrc2->au64[0];
7754 puDst->au64[1] = ~puSrc1->au64[1] & puSrc2->au64[1];
7755 puDst->au64[2] = ~puSrc1->au64[2] & puSrc2->au64[2];
7756 puDst->au64[3] = ~puSrc1->au64[3] & puSrc2->au64[3];
7757}
7758
7759
7760/*
7761 * POR / VPOR / PORPS / VPORPS / PORPD / VPORPD
7762 */
7763#ifdef IEM_WITHOUT_ASSEMBLY
7764
7765IEM_DECL_IMPL_DEF(void, iemAImpl_por_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7766{
7767 RT_NOREF(pFpuState);
7768 *puDst |= *puSrc;
7769}
7770
7771
7772IEM_DECL_IMPL_DEF(void, iemAImpl_por_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7773{
7774 RT_NOREF(pFpuState);
7775 puDst->au64[0] |= puSrc->au64[0];
7776 puDst->au64[1] |= puSrc->au64[1];
7777}
7778
7779#endif
7780
7781IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7782 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7783{
7784 RT_NOREF(pExtState);
7785 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7786 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7787}
7788
7789
7790IEM_DECL_IMPL_DEF(void, iemAImpl_vpor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7791 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7792{
7793 RT_NOREF(pExtState);
7794 puDst->au64[0] = puSrc1->au64[0] | puSrc2->au64[0];
7795 puDst->au64[1] = puSrc1->au64[1] | puSrc2->au64[1];
7796 puDst->au64[2] = puSrc1->au64[2] | puSrc2->au64[2];
7797 puDst->au64[3] = puSrc1->au64[3] | puSrc2->au64[3];
7798}
7799
7800
7801/*
7802 * PXOR / VPXOR / PXORPS / VPXORPS / PXORPD / VPXORPD
7803 */
7804#ifdef IEM_WITHOUT_ASSEMBLY
7805
7806IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7807{
7808 RT_NOREF(pFpuState);
7809 *puDst ^= *puSrc;
7810}
7811
7812
7813IEM_DECL_IMPL_DEF(void, iemAImpl_pxor_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7814{
7815 RT_NOREF(pFpuState);
7816 puDst->au64[0] ^= puSrc->au64[0];
7817 puDst->au64[1] ^= puSrc->au64[1];
7818}
7819
7820#endif
7821
7822IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7823 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7824{
7825 RT_NOREF(pExtState);
7826 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7827 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7828}
7829
7830
7831IEM_DECL_IMPL_DEF(void, iemAImpl_vpxor_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7832 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7833{
7834 RT_NOREF(pExtState);
7835 puDst->au64[0] = puSrc1->au64[0] ^ puSrc2->au64[0];
7836 puDst->au64[1] = puSrc1->au64[1] ^ puSrc2->au64[1];
7837 puDst->au64[2] = puSrc1->au64[2] ^ puSrc2->au64[2];
7838 puDst->au64[3] = puSrc1->au64[3] ^ puSrc2->au64[3];
7839}
7840
7841
7842/*
7843 * PCMPEQB / VPCMPEQB
7844 */
7845#ifdef IEM_WITHOUT_ASSEMBLY
7846
7847IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7848{
7849 RT_NOREF(pFpuState);
7850 RTUINT64U uSrc1 = { *puDst };
7851 RTUINT64U uSrc2 = { *puSrc };
7852 RTUINT64U uDst;
7853 uDst.au8[0] = uSrc1.au8[0] == uSrc2.au8[0] ? 0xff : 0;
7854 uDst.au8[1] = uSrc1.au8[1] == uSrc2.au8[1] ? 0xff : 0;
7855 uDst.au8[2] = uSrc1.au8[2] == uSrc2.au8[2] ? 0xff : 0;
7856 uDst.au8[3] = uSrc1.au8[3] == uSrc2.au8[3] ? 0xff : 0;
7857 uDst.au8[4] = uSrc1.au8[4] == uSrc2.au8[4] ? 0xff : 0;
7858 uDst.au8[5] = uSrc1.au8[5] == uSrc2.au8[5] ? 0xff : 0;
7859 uDst.au8[6] = uSrc1.au8[6] == uSrc2.au8[6] ? 0xff : 0;
7860 uDst.au8[7] = uSrc1.au8[7] == uSrc2.au8[7] ? 0xff : 0;
7861 *puDst = uDst.u;
7862}
7863
7864
7865IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7866{
7867 RT_NOREF(pFpuState);
7868 RTUINT128U uSrc1 = *puDst;
7869 puDst->au8[0] = uSrc1.au8[0] == puSrc->au8[0] ? UINT8_MAX : 0;
7870 puDst->au8[1] = uSrc1.au8[1] == puSrc->au8[1] ? UINT8_MAX : 0;
7871 puDst->au8[2] = uSrc1.au8[2] == puSrc->au8[2] ? UINT8_MAX : 0;
7872 puDst->au8[3] = uSrc1.au8[3] == puSrc->au8[3] ? UINT8_MAX : 0;
7873 puDst->au8[4] = uSrc1.au8[4] == puSrc->au8[4] ? UINT8_MAX : 0;
7874 puDst->au8[5] = uSrc1.au8[5] == puSrc->au8[5] ? UINT8_MAX : 0;
7875 puDst->au8[6] = uSrc1.au8[6] == puSrc->au8[6] ? UINT8_MAX : 0;
7876 puDst->au8[7] = uSrc1.au8[7] == puSrc->au8[7] ? UINT8_MAX : 0;
7877 puDst->au8[8] = uSrc1.au8[8] == puSrc->au8[8] ? UINT8_MAX : 0;
7878 puDst->au8[9] = uSrc1.au8[9] == puSrc->au8[9] ? UINT8_MAX : 0;
7879 puDst->au8[10] = uSrc1.au8[10] == puSrc->au8[10] ? UINT8_MAX : 0;
7880 puDst->au8[11] = uSrc1.au8[11] == puSrc->au8[11] ? UINT8_MAX : 0;
7881 puDst->au8[12] = uSrc1.au8[12] == puSrc->au8[12] ? UINT8_MAX : 0;
7882 puDst->au8[13] = uSrc1.au8[13] == puSrc->au8[13] ? UINT8_MAX : 0;
7883 puDst->au8[14] = uSrc1.au8[14] == puSrc->au8[14] ? UINT8_MAX : 0;
7884 puDst->au8[15] = uSrc1.au8[15] == puSrc->au8[15] ? UINT8_MAX : 0;
7885}
7886
7887#endif
7888
7889IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7890 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7891{
7892 RT_NOREF(pExtState);
7893 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7894 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7895 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7896 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7897 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7898 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7899 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7900 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7901 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7902 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7903 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7904 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7905 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7906 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7907 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7908 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7909}
7910
7911IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
7912 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
7913{
7914 RT_NOREF(pExtState);
7915 puDst->au8[0] = puSrc1->au8[0] == puSrc2->au8[0] ? UINT8_MAX : 0;
7916 puDst->au8[1] = puSrc1->au8[1] == puSrc2->au8[1] ? UINT8_MAX : 0;
7917 puDst->au8[2] = puSrc1->au8[2] == puSrc2->au8[2] ? UINT8_MAX : 0;
7918 puDst->au8[3] = puSrc1->au8[3] == puSrc2->au8[3] ? UINT8_MAX : 0;
7919 puDst->au8[4] = puSrc1->au8[4] == puSrc2->au8[4] ? UINT8_MAX : 0;
7920 puDst->au8[5] = puSrc1->au8[5] == puSrc2->au8[5] ? UINT8_MAX : 0;
7921 puDst->au8[6] = puSrc1->au8[6] == puSrc2->au8[6] ? UINT8_MAX : 0;
7922 puDst->au8[7] = puSrc1->au8[7] == puSrc2->au8[7] ? UINT8_MAX : 0;
7923 puDst->au8[8] = puSrc1->au8[8] == puSrc2->au8[8] ? UINT8_MAX : 0;
7924 puDst->au8[9] = puSrc1->au8[9] == puSrc2->au8[9] ? UINT8_MAX : 0;
7925 puDst->au8[10] = puSrc1->au8[10] == puSrc2->au8[10] ? UINT8_MAX : 0;
7926 puDst->au8[11] = puSrc1->au8[11] == puSrc2->au8[11] ? UINT8_MAX : 0;
7927 puDst->au8[12] = puSrc1->au8[12] == puSrc2->au8[12] ? UINT8_MAX : 0;
7928 puDst->au8[13] = puSrc1->au8[13] == puSrc2->au8[13] ? UINT8_MAX : 0;
7929 puDst->au8[14] = puSrc1->au8[14] == puSrc2->au8[14] ? UINT8_MAX : 0;
7930 puDst->au8[15] = puSrc1->au8[15] == puSrc2->au8[15] ? UINT8_MAX : 0;
7931 puDst->au8[16] = puSrc1->au8[16] == puSrc2->au8[16] ? UINT8_MAX : 0;
7932 puDst->au8[17] = puSrc1->au8[17] == puSrc2->au8[17] ? UINT8_MAX : 0;
7933 puDst->au8[18] = puSrc1->au8[18] == puSrc2->au8[18] ? UINT8_MAX : 0;
7934 puDst->au8[19] = puSrc1->au8[19] == puSrc2->au8[19] ? UINT8_MAX : 0;
7935 puDst->au8[20] = puSrc1->au8[20] == puSrc2->au8[20] ? UINT8_MAX : 0;
7936 puDst->au8[21] = puSrc1->au8[21] == puSrc2->au8[21] ? UINT8_MAX : 0;
7937 puDst->au8[22] = puSrc1->au8[22] == puSrc2->au8[22] ? UINT8_MAX : 0;
7938 puDst->au8[23] = puSrc1->au8[23] == puSrc2->au8[23] ? UINT8_MAX : 0;
7939 puDst->au8[24] = puSrc1->au8[24] == puSrc2->au8[24] ? UINT8_MAX : 0;
7940 puDst->au8[25] = puSrc1->au8[25] == puSrc2->au8[25] ? UINT8_MAX : 0;
7941 puDst->au8[26] = puSrc1->au8[26] == puSrc2->au8[26] ? UINT8_MAX : 0;
7942 puDst->au8[27] = puSrc1->au8[27] == puSrc2->au8[27] ? UINT8_MAX : 0;
7943 puDst->au8[28] = puSrc1->au8[28] == puSrc2->au8[28] ? UINT8_MAX : 0;
7944 puDst->au8[29] = puSrc1->au8[29] == puSrc2->au8[29] ? UINT8_MAX : 0;
7945 puDst->au8[30] = puSrc1->au8[30] == puSrc2->au8[30] ? UINT8_MAX : 0;
7946 puDst->au8[31] = puSrc1->au8[31] == puSrc2->au8[31] ? UINT8_MAX : 0;
7947}
7948
7949
7950/*
7951 * PCMPEQW / VPCMPEQW
7952 */
7953#ifdef IEM_WITHOUT_ASSEMBLY
7954
7955IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
7956{
7957 RT_NOREF(pFpuState);
7958 RTUINT64U uSrc1 = { *puDst };
7959 RTUINT64U uSrc2 = { *puSrc };
7960 RTUINT64U uDst;
7961 uDst.au16[0] = uSrc1.au16[0] == uSrc2.au16[0] ? UINT16_MAX : 0;
7962 uDst.au16[1] = uSrc1.au16[1] == uSrc2.au16[1] ? UINT16_MAX : 0;
7963 uDst.au16[2] = uSrc1.au16[2] == uSrc2.au16[2] ? UINT16_MAX : 0;
7964 uDst.au16[3] = uSrc1.au16[3] == uSrc2.au16[3] ? UINT16_MAX : 0;
7965 *puDst = uDst.u;
7966}
7967
7968
7969IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
7970{
7971 RT_NOREF(pFpuState);
7972 RTUINT128U uSrc1 = *puDst;
7973 puDst->au16[0] = uSrc1.au16[0] == puSrc->au16[0] ? UINT16_MAX : 0;
7974 puDst->au16[1] = uSrc1.au16[1] == puSrc->au16[1] ? UINT16_MAX : 0;
7975 puDst->au16[2] = uSrc1.au16[2] == puSrc->au16[2] ? UINT16_MAX : 0;
7976 puDst->au16[3] = uSrc1.au16[3] == puSrc->au16[3] ? UINT16_MAX : 0;
7977 puDst->au16[4] = uSrc1.au16[4] == puSrc->au16[4] ? UINT16_MAX : 0;
7978 puDst->au16[5] = uSrc1.au16[5] == puSrc->au16[5] ? UINT16_MAX : 0;
7979 puDst->au16[6] = uSrc1.au16[6] == puSrc->au16[6] ? UINT16_MAX : 0;
7980 puDst->au16[7] = uSrc1.au16[7] == puSrc->au16[7] ? UINT16_MAX : 0;
7981}
7982
7983#endif
7984
7985IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
7986 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
7987{
7988 RT_NOREF(pExtState);
7989 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
7990 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
7991 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
7992 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
7993 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
7994 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
7995 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
7996 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
7997}
7998
7999IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8000 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8001{
8002 RT_NOREF(pExtState);
8003 puDst->au16[0] = puSrc1->au16[0] == puSrc2->au16[0] ? UINT16_MAX : 0;
8004 puDst->au16[1] = puSrc1->au16[1] == puSrc2->au16[1] ? UINT16_MAX : 0;
8005 puDst->au16[2] = puSrc1->au16[2] == puSrc2->au16[2] ? UINT16_MAX : 0;
8006 puDst->au16[3] = puSrc1->au16[3] == puSrc2->au16[3] ? UINT16_MAX : 0;
8007 puDst->au16[4] = puSrc1->au16[4] == puSrc2->au16[4] ? UINT16_MAX : 0;
8008 puDst->au16[5] = puSrc1->au16[5] == puSrc2->au16[5] ? UINT16_MAX : 0;
8009 puDst->au16[6] = puSrc1->au16[6] == puSrc2->au16[6] ? UINT16_MAX : 0;
8010 puDst->au16[7] = puSrc1->au16[7] == puSrc2->au16[7] ? UINT16_MAX : 0;
8011 puDst->au16[8] = puSrc1->au16[8] == puSrc2->au16[8] ? UINT16_MAX : 0;
8012 puDst->au16[9] = puSrc1->au16[9] == puSrc2->au16[9] ? UINT16_MAX : 0;
8013 puDst->au16[10] = puSrc1->au16[10] == puSrc2->au16[10] ? UINT16_MAX : 0;
8014 puDst->au16[11] = puSrc1->au16[11] == puSrc2->au16[11] ? UINT16_MAX : 0;
8015 puDst->au16[12] = puSrc1->au16[12] == puSrc2->au16[12] ? UINT16_MAX : 0;
8016 puDst->au16[13] = puSrc1->au16[13] == puSrc2->au16[13] ? UINT16_MAX : 0;
8017 puDst->au16[14] = puSrc1->au16[14] == puSrc2->au16[14] ? UINT16_MAX : 0;
8018 puDst->au16[15] = puSrc1->au16[15] == puSrc2->au16[15] ? UINT16_MAX : 0;
8019}
8020
8021
8022/*
8023 * PCMPEQD / VPCMPEQD.
8024 */
8025#ifdef IEM_WITHOUT_ASSEMBLY
8026
8027IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8028{
8029 RT_NOREF(pFpuState);
8030 RTUINT64U uSrc1 = { *puDst };
8031 RTUINT64U uSrc2 = { *puSrc };
8032 RTUINT64U uDst;
8033 uDst.au32[0] = uSrc1.au32[0] == uSrc2.au32[0] ? UINT32_MAX : 0;
8034 uDst.au32[1] = uSrc1.au32[1] == uSrc2.au32[1] ? UINT32_MAX : 0;
8035 *puDst = uDst.u;
8036}
8037
8038
8039IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8040{
8041 RT_NOREF(pFpuState);
8042 RTUINT128U uSrc1 = *puDst;
8043 puDst->au32[0] = uSrc1.au32[0] == puSrc->au32[0] ? UINT32_MAX : 0;
8044 puDst->au32[1] = uSrc1.au32[1] == puSrc->au32[1] ? UINT32_MAX : 0;
8045 puDst->au32[2] = uSrc1.au32[2] == puSrc->au32[2] ? UINT32_MAX : 0;
8046 puDst->au32[3] = uSrc1.au32[3] == puSrc->au32[3] ? UINT32_MAX : 0;
8047}
8048
8049#endif /* IEM_WITHOUT_ASSEMBLY */
8050
8051IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8052 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8053{
8054 RT_NOREF(pExtState);
8055 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8056 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8057 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8058 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8059}
8060
8061IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8062 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8063{
8064 RT_NOREF(pExtState);
8065 puDst->au32[0] = puSrc1->au32[0] == puSrc2->au32[0] ? UINT32_MAX : 0;
8066 puDst->au32[1] = puSrc1->au32[1] == puSrc2->au32[1] ? UINT32_MAX : 0;
8067 puDst->au32[2] = puSrc1->au32[2] == puSrc2->au32[2] ? UINT32_MAX : 0;
8068 puDst->au32[3] = puSrc1->au32[3] == puSrc2->au32[3] ? UINT32_MAX : 0;
8069 puDst->au32[4] = puSrc1->au32[4] == puSrc2->au32[4] ? UINT32_MAX : 0;
8070 puDst->au32[5] = puSrc1->au32[5] == puSrc2->au32[5] ? UINT32_MAX : 0;
8071 puDst->au32[6] = puSrc1->au32[6] == puSrc2->au32[6] ? UINT32_MAX : 0;
8072 puDst->au32[7] = puSrc1->au32[7] == puSrc2->au32[7] ? UINT32_MAX : 0;
8073}
8074
8075
8076/*
8077 * PCMPEQQ / VPCMPEQQ.
8078 */
8079IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpeqq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8080{
8081 RT_NOREF(pFpuState);
8082 RTUINT128U uSrc1 = *puDst;
8083 puDst->au64[0] = uSrc1.au64[0] == puSrc->au64[0] ? UINT64_MAX : 0;
8084 puDst->au64[1] = uSrc1.au64[1] == puSrc->au64[1] ? UINT64_MAX : 0;
8085}
8086
8087IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8088 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8089{
8090 RT_NOREF(pExtState);
8091 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8092 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8093}
8094
8095IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpeqq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8096 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8097{
8098 RT_NOREF(pExtState);
8099 puDst->au64[0] = puSrc1->au64[0] == puSrc2->au64[0] ? UINT64_MAX : 0;
8100 puDst->au64[1] = puSrc1->au64[1] == puSrc2->au64[1] ? UINT64_MAX : 0;
8101 puDst->au64[2] = puSrc1->au64[2] == puSrc2->au64[2] ? UINT64_MAX : 0;
8102 puDst->au64[3] = puSrc1->au64[3] == puSrc2->au64[3] ? UINT64_MAX : 0;
8103}
8104
8105
8106/*
8107 * PCMPGTB / VPCMPGTB
8108 */
8109#ifdef IEM_WITHOUT_ASSEMBLY
8110
8111IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8112{
8113 RT_NOREF(pFpuState);
8114 RTUINT64U uSrc1 = { *puDst };
8115 RTUINT64U uSrc2 = { *puSrc };
8116 RTUINT64U uDst;
8117 uDst.au8[0] = uSrc1.ai8[0] > uSrc2.ai8[0] ? UINT8_MAX : 0;
8118 uDst.au8[1] = uSrc1.ai8[1] > uSrc2.ai8[1] ? UINT8_MAX : 0;
8119 uDst.au8[2] = uSrc1.ai8[2] > uSrc2.ai8[2] ? UINT8_MAX : 0;
8120 uDst.au8[3] = uSrc1.ai8[3] > uSrc2.ai8[3] ? UINT8_MAX : 0;
8121 uDst.au8[4] = uSrc1.ai8[4] > uSrc2.ai8[4] ? UINT8_MAX : 0;
8122 uDst.au8[5] = uSrc1.ai8[5] > uSrc2.ai8[5] ? UINT8_MAX : 0;
8123 uDst.au8[6] = uSrc1.ai8[6] > uSrc2.ai8[6] ? UINT8_MAX : 0;
8124 uDst.au8[7] = uSrc1.ai8[7] > uSrc2.ai8[7] ? UINT8_MAX : 0;
8125 *puDst = uDst.u;
8126}
8127
8128
8129IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8130{
8131 RT_NOREF(pFpuState);
8132 RTUINT128U uSrc1 = *puDst;
8133 puDst->au8[0] = uSrc1.ai8[0] > puSrc->ai8[0] ? UINT8_MAX : 0;
8134 puDst->au8[1] = uSrc1.ai8[1] > puSrc->ai8[1] ? UINT8_MAX : 0;
8135 puDst->au8[2] = uSrc1.ai8[2] > puSrc->ai8[2] ? UINT8_MAX : 0;
8136 puDst->au8[3] = uSrc1.ai8[3] > puSrc->ai8[3] ? UINT8_MAX : 0;
8137 puDst->au8[4] = uSrc1.ai8[4] > puSrc->ai8[4] ? UINT8_MAX : 0;
8138 puDst->au8[5] = uSrc1.ai8[5] > puSrc->ai8[5] ? UINT8_MAX : 0;
8139 puDst->au8[6] = uSrc1.ai8[6] > puSrc->ai8[6] ? UINT8_MAX : 0;
8140 puDst->au8[7] = uSrc1.ai8[7] > puSrc->ai8[7] ? UINT8_MAX : 0;
8141 puDst->au8[8] = uSrc1.ai8[8] > puSrc->ai8[8] ? UINT8_MAX : 0;
8142 puDst->au8[9] = uSrc1.ai8[9] > puSrc->ai8[9] ? UINT8_MAX : 0;
8143 puDst->au8[10] = uSrc1.ai8[10] > puSrc->ai8[10] ? UINT8_MAX : 0;
8144 puDst->au8[11] = uSrc1.ai8[11] > puSrc->ai8[11] ? UINT8_MAX : 0;
8145 puDst->au8[12] = uSrc1.ai8[12] > puSrc->ai8[12] ? UINT8_MAX : 0;
8146 puDst->au8[13] = uSrc1.ai8[13] > puSrc->ai8[13] ? UINT8_MAX : 0;
8147 puDst->au8[14] = uSrc1.ai8[14] > puSrc->ai8[14] ? UINT8_MAX : 0;
8148 puDst->au8[15] = uSrc1.ai8[15] > puSrc->ai8[15] ? UINT8_MAX : 0;
8149}
8150
8151#endif
8152
8153IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8154 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8155{
8156 RT_NOREF(pExtState);
8157 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8158 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8159 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8160 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8161 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8162 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8163 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8164 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8165 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8166 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8167 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8168 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8169 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8170 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8171 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8172 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8173}
8174
8175IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8176 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8177{
8178 RT_NOREF(pExtState);
8179 puDst->au8[0] = puSrc1->ai8[0] > puSrc2->ai8[0] ? UINT8_MAX : 0;
8180 puDst->au8[1] = puSrc1->ai8[1] > puSrc2->ai8[1] ? UINT8_MAX : 0;
8181 puDst->au8[2] = puSrc1->ai8[2] > puSrc2->ai8[2] ? UINT8_MAX : 0;
8182 puDst->au8[3] = puSrc1->ai8[3] > puSrc2->ai8[3] ? UINT8_MAX : 0;
8183 puDst->au8[4] = puSrc1->ai8[4] > puSrc2->ai8[4] ? UINT8_MAX : 0;
8184 puDst->au8[5] = puSrc1->ai8[5] > puSrc2->ai8[5] ? UINT8_MAX : 0;
8185 puDst->au8[6] = puSrc1->ai8[6] > puSrc2->ai8[6] ? UINT8_MAX : 0;
8186 puDst->au8[7] = puSrc1->ai8[7] > puSrc2->ai8[7] ? UINT8_MAX : 0;
8187 puDst->au8[8] = puSrc1->ai8[8] > puSrc2->ai8[8] ? UINT8_MAX : 0;
8188 puDst->au8[9] = puSrc1->ai8[9] > puSrc2->ai8[9] ? UINT8_MAX : 0;
8189 puDst->au8[10] = puSrc1->ai8[10] > puSrc2->ai8[10] ? UINT8_MAX : 0;
8190 puDst->au8[11] = puSrc1->ai8[11] > puSrc2->ai8[11] ? UINT8_MAX : 0;
8191 puDst->au8[12] = puSrc1->ai8[12] > puSrc2->ai8[12] ? UINT8_MAX : 0;
8192 puDst->au8[13] = puSrc1->ai8[13] > puSrc2->ai8[13] ? UINT8_MAX : 0;
8193 puDst->au8[14] = puSrc1->ai8[14] > puSrc2->ai8[14] ? UINT8_MAX : 0;
8194 puDst->au8[15] = puSrc1->ai8[15] > puSrc2->ai8[15] ? UINT8_MAX : 0;
8195 puDst->au8[16] = puSrc1->ai8[16] > puSrc2->ai8[16] ? UINT8_MAX : 0;
8196 puDst->au8[17] = puSrc1->ai8[17] > puSrc2->ai8[17] ? UINT8_MAX : 0;
8197 puDst->au8[18] = puSrc1->ai8[18] > puSrc2->ai8[18] ? UINT8_MAX : 0;
8198 puDst->au8[19] = puSrc1->ai8[19] > puSrc2->ai8[19] ? UINT8_MAX : 0;
8199 puDst->au8[20] = puSrc1->ai8[20] > puSrc2->ai8[20] ? UINT8_MAX : 0;
8200 puDst->au8[21] = puSrc1->ai8[21] > puSrc2->ai8[21] ? UINT8_MAX : 0;
8201 puDst->au8[22] = puSrc1->ai8[22] > puSrc2->ai8[22] ? UINT8_MAX : 0;
8202 puDst->au8[23] = puSrc1->ai8[23] > puSrc2->ai8[23] ? UINT8_MAX : 0;
8203 puDst->au8[24] = puSrc1->ai8[24] > puSrc2->ai8[24] ? UINT8_MAX : 0;
8204 puDst->au8[25] = puSrc1->ai8[25] > puSrc2->ai8[25] ? UINT8_MAX : 0;
8205 puDst->au8[26] = puSrc1->ai8[26] > puSrc2->ai8[26] ? UINT8_MAX : 0;
8206 puDst->au8[27] = puSrc1->ai8[27] > puSrc2->ai8[27] ? UINT8_MAX : 0;
8207 puDst->au8[28] = puSrc1->ai8[28] > puSrc2->ai8[28] ? UINT8_MAX : 0;
8208 puDst->au8[29] = puSrc1->ai8[29] > puSrc2->ai8[29] ? UINT8_MAX : 0;
8209 puDst->au8[30] = puSrc1->ai8[30] > puSrc2->ai8[30] ? UINT8_MAX : 0;
8210 puDst->au8[31] = puSrc1->ai8[31] > puSrc2->ai8[31] ? UINT8_MAX : 0;
8211}
8212
8213
8214/*
8215 * PCMPGTW / VPCMPGTW
8216 */
8217#ifdef IEM_WITHOUT_ASSEMBLY
8218
8219IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8220{
8221 RT_NOREF(pFpuState);
8222 RTUINT64U uSrc1 = { *puDst };
8223 RTUINT64U uSrc2 = { *puSrc };
8224 RTUINT64U uDst;
8225 uDst.au16[0] = uSrc1.ai16[0] > uSrc2.ai16[0] ? UINT16_MAX : 0;
8226 uDst.au16[1] = uSrc1.ai16[1] > uSrc2.ai16[1] ? UINT16_MAX : 0;
8227 uDst.au16[2] = uSrc1.ai16[2] > uSrc2.ai16[2] ? UINT16_MAX : 0;
8228 uDst.au16[3] = uSrc1.ai16[3] > uSrc2.ai16[3] ? UINT16_MAX : 0;
8229 *puDst = uDst.u;
8230}
8231
8232
8233IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8234{
8235 RT_NOREF(pFpuState);
8236 RTUINT128U uSrc1 = *puDst;
8237 puDst->au16[0] = uSrc1.ai16[0] > puSrc->ai16[0] ? UINT16_MAX : 0;
8238 puDst->au16[1] = uSrc1.ai16[1] > puSrc->ai16[1] ? UINT16_MAX : 0;
8239 puDst->au16[2] = uSrc1.ai16[2] > puSrc->ai16[2] ? UINT16_MAX : 0;
8240 puDst->au16[3] = uSrc1.ai16[3] > puSrc->ai16[3] ? UINT16_MAX : 0;
8241 puDst->au16[4] = uSrc1.ai16[4] > puSrc->ai16[4] ? UINT16_MAX : 0;
8242 puDst->au16[5] = uSrc1.ai16[5] > puSrc->ai16[5] ? UINT16_MAX : 0;
8243 puDst->au16[6] = uSrc1.ai16[6] > puSrc->ai16[6] ? UINT16_MAX : 0;
8244 puDst->au16[7] = uSrc1.ai16[7] > puSrc->ai16[7] ? UINT16_MAX : 0;
8245}
8246
8247#endif
8248
8249IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8250 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8251{
8252 RT_NOREF(pExtState);
8253 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8254 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8255 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8256 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8257 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8258 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8259 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8260 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8261}
8262
8263IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8264 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8265{
8266 RT_NOREF(pExtState);
8267 puDst->au16[0] = puSrc1->ai16[0] > puSrc2->ai16[0] ? UINT16_MAX : 0;
8268 puDst->au16[1] = puSrc1->ai16[1] > puSrc2->ai16[1] ? UINT16_MAX : 0;
8269 puDst->au16[2] = puSrc1->ai16[2] > puSrc2->ai16[2] ? UINT16_MAX : 0;
8270 puDst->au16[3] = puSrc1->ai16[3] > puSrc2->ai16[3] ? UINT16_MAX : 0;
8271 puDst->au16[4] = puSrc1->ai16[4] > puSrc2->ai16[4] ? UINT16_MAX : 0;
8272 puDst->au16[5] = puSrc1->ai16[5] > puSrc2->ai16[5] ? UINT16_MAX : 0;
8273 puDst->au16[6] = puSrc1->ai16[6] > puSrc2->ai16[6] ? UINT16_MAX : 0;
8274 puDst->au16[7] = puSrc1->ai16[7] > puSrc2->ai16[7] ? UINT16_MAX : 0;
8275 puDst->au16[8] = puSrc1->ai16[8] > puSrc2->ai16[8] ? UINT16_MAX : 0;
8276 puDst->au16[9] = puSrc1->ai16[9] > puSrc2->ai16[9] ? UINT16_MAX : 0;
8277 puDst->au16[10] = puSrc1->ai16[10] > puSrc2->ai16[10] ? UINT16_MAX : 0;
8278 puDst->au16[11] = puSrc1->ai16[11] > puSrc2->ai16[11] ? UINT16_MAX : 0;
8279 puDst->au16[12] = puSrc1->ai16[12] > puSrc2->ai16[12] ? UINT16_MAX : 0;
8280 puDst->au16[13] = puSrc1->ai16[13] > puSrc2->ai16[13] ? UINT16_MAX : 0;
8281 puDst->au16[14] = puSrc1->ai16[14] > puSrc2->ai16[14] ? UINT16_MAX : 0;
8282 puDst->au16[15] = puSrc1->ai16[15] > puSrc2->ai16[15] ? UINT16_MAX : 0;
8283}
8284
8285
8286/*
8287 * PCMPGTD / VPCMPGTD.
8288 */
8289#ifdef IEM_WITHOUT_ASSEMBLY
8290
8291IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8292{
8293 RT_NOREF(pFpuState);
8294 RTUINT64U uSrc1 = { *puDst };
8295 RTUINT64U uSrc2 = { *puSrc };
8296 RTUINT64U uDst;
8297 uDst.au32[0] = uSrc1.ai32[0] > uSrc2.ai32[0] ? UINT32_MAX : 0;
8298 uDst.au32[1] = uSrc1.ai32[1] > uSrc2.ai32[1] ? UINT32_MAX : 0;
8299 *puDst = uDst.u;
8300}
8301
8302
8303IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8304{
8305 RT_NOREF(pFpuState);
8306 RTUINT128U uSrc1 = *puDst;
8307 puDst->au32[0] = uSrc1.ai32[0] > puSrc->ai32[0] ? UINT32_MAX : 0;
8308 puDst->au32[1] = uSrc1.ai32[1] > puSrc->ai32[1] ? UINT32_MAX : 0;
8309 puDst->au32[2] = uSrc1.ai32[2] > puSrc->ai32[2] ? UINT32_MAX : 0;
8310 puDst->au32[3] = uSrc1.ai32[3] > puSrc->ai32[3] ? UINT32_MAX : 0;
8311}
8312
8313#endif /* IEM_WITHOUT_ASSEMBLY */
8314
8315IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8316 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8317{
8318 RT_NOREF(pExtState);
8319 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8320 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8321 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8322 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8323}
8324
8325IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8326 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8327{
8328 RT_NOREF(pExtState);
8329 puDst->au32[0] = puSrc1->ai32[0] > puSrc2->ai32[0] ? UINT32_MAX : 0;
8330 puDst->au32[1] = puSrc1->ai32[1] > puSrc2->ai32[1] ? UINT32_MAX : 0;
8331 puDst->au32[2] = puSrc1->ai32[2] > puSrc2->ai32[2] ? UINT32_MAX : 0;
8332 puDst->au32[3] = puSrc1->ai32[3] > puSrc2->ai32[3] ? UINT32_MAX : 0;
8333 puDst->au32[4] = puSrc1->ai32[4] > puSrc2->ai32[4] ? UINT32_MAX : 0;
8334 puDst->au32[5] = puSrc1->ai32[5] > puSrc2->ai32[5] ? UINT32_MAX : 0;
8335 puDst->au32[6] = puSrc1->ai32[6] > puSrc2->ai32[6] ? UINT32_MAX : 0;
8336 puDst->au32[7] = puSrc1->ai32[7] > puSrc2->ai32[7] ? UINT32_MAX : 0;
8337}
8338
8339
8340/*
8341 * PCMPGTQ / VPCMPGTQ.
8342 */
8343IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpgtq_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8344{
8345 RT_NOREF(pFpuState);
8346 RTUINT128U uSrc1 = *puDst;
8347 puDst->au64[0] = uSrc1.ai64[0] > puSrc->ai64[0] ? UINT64_MAX : 0;
8348 puDst->au64[1] = uSrc1.ai64[1] > puSrc->ai64[1] ? UINT64_MAX : 0;
8349}
8350
8351IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8352 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8353{
8354 RT_NOREF(pExtState);
8355 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8356 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8357}
8358
8359IEM_DECL_IMPL_DEF(void, iemAImpl_vpcmpgtq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8360 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8361{
8362 RT_NOREF(pExtState);
8363 puDst->au64[0] = puSrc1->ai64[0] > puSrc2->ai64[0] ? UINT64_MAX : 0;
8364 puDst->au64[1] = puSrc1->ai64[1] > puSrc2->ai64[1] ? UINT64_MAX : 0;
8365 puDst->au64[2] = puSrc1->ai64[2] > puSrc2->ai64[2] ? UINT64_MAX : 0;
8366 puDst->au64[3] = puSrc1->ai64[3] > puSrc2->ai64[3] ? UINT64_MAX : 0;
8367}
8368
8369
8370/*
8371 * PADDB / VPADDB
8372 */
8373#ifdef IEM_WITHOUT_ASSEMBLY
8374
8375IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8376{
8377 RT_NOREF(pFpuState);
8378 RTUINT64U uSrc1 = { *puDst };
8379 RTUINT64U uSrc2 = { *puSrc };
8380 RTUINT64U uDst;
8381 uDst.au8[0] = uSrc1.au8[0] + uSrc2.au8[0];
8382 uDst.au8[1] = uSrc1.au8[1] + uSrc2.au8[1];
8383 uDst.au8[2] = uSrc1.au8[2] + uSrc2.au8[2];
8384 uDst.au8[3] = uSrc1.au8[3] + uSrc2.au8[3];
8385 uDst.au8[4] = uSrc1.au8[4] + uSrc2.au8[4];
8386 uDst.au8[5] = uSrc1.au8[5] + uSrc2.au8[5];
8387 uDst.au8[6] = uSrc1.au8[6] + uSrc2.au8[6];
8388 uDst.au8[7] = uSrc1.au8[7] + uSrc2.au8[7];
8389 *puDst = uDst.u;
8390}
8391
8392
8393IEM_DECL_IMPL_DEF(void, iemAImpl_paddb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8394{
8395 RT_NOREF(pFpuState);
8396 RTUINT128U uSrc1 = *puDst;
8397 puDst->au8[0] = uSrc1.au8[0] + puSrc->au8[0];
8398 puDst->au8[1] = uSrc1.au8[1] + puSrc->au8[1];
8399 puDst->au8[2] = uSrc1.au8[2] + puSrc->au8[2];
8400 puDst->au8[3] = uSrc1.au8[3] + puSrc->au8[3];
8401 puDst->au8[4] = uSrc1.au8[4] + puSrc->au8[4];
8402 puDst->au8[5] = uSrc1.au8[5] + puSrc->au8[5];
8403 puDst->au8[6] = uSrc1.au8[6] + puSrc->au8[6];
8404 puDst->au8[7] = uSrc1.au8[7] + puSrc->au8[7];
8405 puDst->au8[8] = uSrc1.au8[8] + puSrc->au8[8];
8406 puDst->au8[9] = uSrc1.au8[9] + puSrc->au8[9];
8407 puDst->au8[10] = uSrc1.au8[10] + puSrc->au8[10];
8408 puDst->au8[11] = uSrc1.au8[11] + puSrc->au8[11];
8409 puDst->au8[12] = uSrc1.au8[12] + puSrc->au8[12];
8410 puDst->au8[13] = uSrc1.au8[13] + puSrc->au8[13];
8411 puDst->au8[14] = uSrc1.au8[14] + puSrc->au8[14];
8412 puDst->au8[15] = uSrc1.au8[15] + puSrc->au8[15];
8413}
8414
8415#endif
8416
8417
8418IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8419 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8420{
8421 RT_NOREF(pExtState);
8422 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8423 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8424 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8425 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8426 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8427 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8428 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8429 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8430 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8431 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8432 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8433 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8434 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8435 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8436 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8437 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8438}
8439
8440IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8441 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8442{
8443 RT_NOREF(pExtState);
8444 puDst->au8[0] = puSrc1->au8[0] + puSrc2->au8[0];
8445 puDst->au8[1] = puSrc1->au8[1] + puSrc2->au8[1];
8446 puDst->au8[2] = puSrc1->au8[2] + puSrc2->au8[2];
8447 puDst->au8[3] = puSrc1->au8[3] + puSrc2->au8[3];
8448 puDst->au8[4] = puSrc1->au8[4] + puSrc2->au8[4];
8449 puDst->au8[5] = puSrc1->au8[5] + puSrc2->au8[5];
8450 puDst->au8[6] = puSrc1->au8[6] + puSrc2->au8[6];
8451 puDst->au8[7] = puSrc1->au8[7] + puSrc2->au8[7];
8452 puDst->au8[8] = puSrc1->au8[8] + puSrc2->au8[8];
8453 puDst->au8[9] = puSrc1->au8[9] + puSrc2->au8[9];
8454 puDst->au8[10] = puSrc1->au8[10] + puSrc2->au8[10];
8455 puDst->au8[11] = puSrc1->au8[11] + puSrc2->au8[11];
8456 puDst->au8[12] = puSrc1->au8[12] + puSrc2->au8[12];
8457 puDst->au8[13] = puSrc1->au8[13] + puSrc2->au8[13];
8458 puDst->au8[14] = puSrc1->au8[14] + puSrc2->au8[14];
8459 puDst->au8[15] = puSrc1->au8[15] + puSrc2->au8[15];
8460 puDst->au8[16] = puSrc1->au8[16] + puSrc2->au8[16];
8461 puDst->au8[17] = puSrc1->au8[17] + puSrc2->au8[17];
8462 puDst->au8[18] = puSrc1->au8[18] + puSrc2->au8[18];
8463 puDst->au8[19] = puSrc1->au8[19] + puSrc2->au8[19];
8464 puDst->au8[20] = puSrc1->au8[20] + puSrc2->au8[20];
8465 puDst->au8[21] = puSrc1->au8[21] + puSrc2->au8[21];
8466 puDst->au8[22] = puSrc1->au8[22] + puSrc2->au8[22];
8467 puDst->au8[23] = puSrc1->au8[23] + puSrc2->au8[23];
8468 puDst->au8[24] = puSrc1->au8[24] + puSrc2->au8[24];
8469 puDst->au8[25] = puSrc1->au8[25] + puSrc2->au8[25];
8470 puDst->au8[26] = puSrc1->au8[26] + puSrc2->au8[26];
8471 puDst->au8[27] = puSrc1->au8[27] + puSrc2->au8[27];
8472 puDst->au8[28] = puSrc1->au8[28] + puSrc2->au8[28];
8473 puDst->au8[29] = puSrc1->au8[29] + puSrc2->au8[29];
8474 puDst->au8[30] = puSrc1->au8[30] + puSrc2->au8[30];
8475 puDst->au8[31] = puSrc1->au8[31] + puSrc2->au8[31];
8476}
8477
8478
8479/*
8480 * PADDSB / VPADDSB
8481 */
/**
 * Narrows a signed 16-bit intermediate to a signed byte with saturation.
 *
 * Values in the range -128..127 are passed through as their 8-bit pattern.
 * Anything else yields 0x7f (INT8_MAX) when bit 15 (the word's sign bit) is
 * clear and 0x80 (INT8_MIN) when it is set.
 *
 * @note The argument is evaluated more than once.
 */
#define SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(a_iWord) \
    ( (uint16_t)((a_iWord) + 0x80) > (uint16_t)0xff /* biased value outside 0..0xff => out of int8 range */ \
      ? (((a_iWord) & 0x8000) ? 0x80 : 0x7f) \
      : (uint8_t)(a_iWord) )
8486
8487#ifdef IEM_WITHOUT_ASSEMBLY
8488
8489IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8490{
8491 RT_NOREF(pFpuState);
8492 RTUINT64U uSrc1 = { *puDst };
8493 RTUINT64U uSrc2 = { *puSrc };
8494 RTUINT64U uDst;
8495 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + uSrc2.ai8[0]);
8496 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + uSrc2.ai8[1]);
8497 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + uSrc2.ai8[2]);
8498 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + uSrc2.ai8[3]);
8499 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + uSrc2.ai8[4]);
8500 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + uSrc2.ai8[5]);
8501 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + uSrc2.ai8[6]);
8502 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + uSrc2.ai8[7]);
8503 *puDst = uDst.u;
8504}
8505
8506
8507IEM_DECL_IMPL_DEF(void, iemAImpl_paddsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8508{
8509 RT_NOREF(pFpuState);
8510 RTUINT128U uSrc1 = *puDst;
8511 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] + puSrc->ai8[0]);
8512 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] + puSrc->ai8[1]);
8513 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] + puSrc->ai8[2]);
8514 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] + puSrc->ai8[3]);
8515 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] + puSrc->ai8[4]);
8516 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] + puSrc->ai8[5]);
8517 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] + puSrc->ai8[6]);
8518 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] + puSrc->ai8[7]);
8519 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] + puSrc->ai8[8]);
8520 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] + puSrc->ai8[9]);
8521 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] + puSrc->ai8[10]);
8522 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] + puSrc->ai8[11]);
8523 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] + puSrc->ai8[12]);
8524 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] + puSrc->ai8[13]);
8525 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] + puSrc->ai8[14]);
8526 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] + puSrc->ai8[15]);
8527}
8528
8529#endif
8530
8531
/*
 * PADDUSB / VPADDUSB
 */
/**
 * Narrows an unsigned 16-bit intermediate to an unsigned byte with
 * saturation: values above 0xff yield 0xff (UINT8_MAX), everything else is
 * passed through unchanged.
 *
 * @note The argument is evaluated more than once.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(a_uWord) \
    ( (uint16_t)(a_uWord) > (uint16_t)0xff \
      ? (uint8_t)0xff \
      : (uint8_t)(a_uWord) )
8539
8540#ifdef IEM_WITHOUT_ASSEMBLY
8541
8542IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8543{
8544 RT_NOREF(pFpuState);
8545 RTUINT64U uSrc1 = { *puDst };
8546 RTUINT64U uSrc2 = { *puSrc };
8547 RTUINT64U uDst;
8548 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + uSrc2.au8[0]);
8549 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + uSrc2.au8[1]);
8550 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + uSrc2.au8[2]);
8551 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + uSrc2.au8[3]);
8552 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + uSrc2.au8[4]);
8553 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + uSrc2.au8[5]);
8554 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + uSrc2.au8[6]);
8555 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + uSrc2.au8[7]);
8556 *puDst = uDst.u;
8557}
8558
8559
8560IEM_DECL_IMPL_DEF(void, iemAImpl_paddusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8561{
8562 RT_NOREF(pFpuState);
8563 RTUINT128U uSrc1 = *puDst;
8564 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[0] + puSrc->au8[0]);
8565 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[1] + puSrc->au8[1]);
8566 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[2] + puSrc->au8[2]);
8567 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[3] + puSrc->au8[3]);
8568 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[4] + puSrc->au8[4]);
8569 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[5] + puSrc->au8[5]);
8570 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[6] + puSrc->au8[6]);
8571 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[7] + puSrc->au8[7]);
8572 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[8] + puSrc->au8[8]);
8573 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[9] + puSrc->au8[9]);
8574 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[10] + puSrc->au8[10]);
8575 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[11] + puSrc->au8[11]);
8576 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[12] + puSrc->au8[12]);
8577 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[13] + puSrc->au8[13]);
8578 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[14] + puSrc->au8[14]);
8579 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au8[15] + puSrc->au8[15]);
8580}
8581
8582#endif
8583
8584IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u128_fallback,(PRTUINT128U puDst,
8585 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8586{
8587 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8588 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8589 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8590 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8591 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8592 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8593 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8594 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8595 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8596 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8597 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8598 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8599 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8600 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8601 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8602 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8603}
8604
8605IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusb_u256_fallback,(PRTUINT256U puDst,
8606 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8607{
8608 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[0] + puSrc2->au8[0]);
8609 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[1] + puSrc2->au8[1]);
8610 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[2] + puSrc2->au8[2]);
8611 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[3] + puSrc2->au8[3]);
8612 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[4] + puSrc2->au8[4]);
8613 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[5] + puSrc2->au8[5]);
8614 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[6] + puSrc2->au8[6]);
8615 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[7] + puSrc2->au8[7]);
8616 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[8] + puSrc2->au8[8]);
8617 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[9] + puSrc2->au8[9]);
8618 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[10] + puSrc2->au8[10]);
8619 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[11] + puSrc2->au8[11]);
8620 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[12] + puSrc2->au8[12]);
8621 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[13] + puSrc2->au8[13]);
8622 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[14] + puSrc2->au8[14]);
8623 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[15] + puSrc2->au8[15]);
8624 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[16] + puSrc2->au8[16]);
8625 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[17] + puSrc2->au8[17]);
8626 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[18] + puSrc2->au8[18]);
8627 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[19] + puSrc2->au8[19]);
8628 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[20] + puSrc2->au8[20]);
8629 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[21] + puSrc2->au8[21]);
8630 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[22] + puSrc2->au8[22]);
8631 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[23] + puSrc2->au8[23]);
8632 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[24] + puSrc2->au8[24]);
8633 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[25] + puSrc2->au8[25]);
8634 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[26] + puSrc2->au8[26]);
8635 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[27] + puSrc2->au8[27]);
8636 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[28] + puSrc2->au8[28]);
8637 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[29] + puSrc2->au8[29]);
8638 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[30] + puSrc2->au8[30]);
8639 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE(puSrc1->au8[31] + puSrc2->au8[31]);
8640}
8641
8642
8643/*
8644 * PADDW / VPADDW
8645 */
8646#ifdef IEM_WITHOUT_ASSEMBLY
8647
8648IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8649{
8650 RT_NOREF(pFpuState);
8651 RTUINT64U uSrc1 = { *puDst };
8652 RTUINT64U uSrc2 = { *puSrc };
8653 RTUINT64U uDst;
8654 uDst.au16[0] = uSrc1.au16[0] + uSrc2.au16[0];
8655 uDst.au16[1] = uSrc1.au16[1] + uSrc2.au16[1];
8656 uDst.au16[2] = uSrc1.au16[2] + uSrc2.au16[2];
8657 uDst.au16[3] = uSrc1.au16[3] + uSrc2.au16[3];
8658 *puDst = uDst.u;
8659}
8660
8661
8662IEM_DECL_IMPL_DEF(void, iemAImpl_paddw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8663{
8664 RT_NOREF(pFpuState);
8665 RTUINT128U uSrc1 = *puDst;
8666 puDst->au16[0] = uSrc1.au16[0] + puSrc->au16[0];
8667 puDst->au16[1] = uSrc1.au16[1] + puSrc->au16[1];
8668 puDst->au16[2] = uSrc1.au16[2] + puSrc->au16[2];
8669 puDst->au16[3] = uSrc1.au16[3] + puSrc->au16[3];
8670 puDst->au16[4] = uSrc1.au16[4] + puSrc->au16[4];
8671 puDst->au16[5] = uSrc1.au16[5] + puSrc->au16[5];
8672 puDst->au16[6] = uSrc1.au16[6] + puSrc->au16[6];
8673 puDst->au16[7] = uSrc1.au16[7] + puSrc->au16[7];
8674}
8675
8676#endif
8677
8678
8679IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8680 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8681{
8682 RT_NOREF(pExtState);
8683 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8684 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8685 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8686 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8687 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8688 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8689 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8690 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8691}
8692
8693IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8694 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8695{
8696 RT_NOREF(pExtState);
8697 puDst->au16[0] = puSrc1->au16[0] + puSrc2->au16[0];
8698 puDst->au16[1] = puSrc1->au16[1] + puSrc2->au16[1];
8699 puDst->au16[2] = puSrc1->au16[2] + puSrc2->au16[2];
8700 puDst->au16[3] = puSrc1->au16[3] + puSrc2->au16[3];
8701 puDst->au16[4] = puSrc1->au16[4] + puSrc2->au16[4];
8702 puDst->au16[5] = puSrc1->au16[5] + puSrc2->au16[5];
8703 puDst->au16[6] = puSrc1->au16[6] + puSrc2->au16[6];
8704 puDst->au16[7] = puSrc1->au16[7] + puSrc2->au16[7];
8705 puDst->au16[8] = puSrc1->au16[8] + puSrc2->au16[8];
8706 puDst->au16[9] = puSrc1->au16[9] + puSrc2->au16[9];
8707 puDst->au16[10] = puSrc1->au16[10] + puSrc2->au16[10];
8708 puDst->au16[11] = puSrc1->au16[11] + puSrc2->au16[11];
8709 puDst->au16[12] = puSrc1->au16[12] + puSrc2->au16[12];
8710 puDst->au16[13] = puSrc1->au16[13] + puSrc2->au16[13];
8711 puDst->au16[14] = puSrc1->au16[14] + puSrc2->au16[14];
8712 puDst->au16[15] = puSrc1->au16[15] + puSrc2->au16[15];
8713}
8714
8715
8716/*
8717 * PADDSW / VPADDSW
8718 */
/**
 * Clamps a signed 32-bit value to the signed 16-bit range and yields the
 * result as a 16-bit pattern.
 *
 * Adding 0x8000 moves the range [-0x8000, 0x7fff] onto [0, 0xffff], so a
 * single unsigned compare covers both bounds.  In-range values are passed
 * through truncated to 16 bits; out-of-range values yield 0x7fff plus the
 * sign bit, i.e. 0x7fff when positive and 0x8000 when negative.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 * @note    NOTE(review): the '>> 31' expects bit 31 of the argument to be the
 *          sign bit; the callers next to this definition pass the int sum of
 *          two ai16 lanes.
 */
#define SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(a_iDword) \
        ( (uint32_t)((a_iDword) + 0x8000) <= (uint16_t)0xffff \
          ? (uint16_t)(a_iDword) \
          : (uint16_t)0x7fff + (uint16_t)(((a_iDword) >> 31) & 1) ) /* 0x7fff = INT16_MAX; 0x8000 = INT16_MIN; source bit 31 = sign */
8723
8724#ifdef IEM_WITHOUT_ASSEMBLY
8725
8726IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8727{
8728 RT_NOREF(pFpuState);
8729 RTUINT64U uSrc1 = { *puDst };
8730 RTUINT64U uSrc2 = { *puSrc };
8731 RTUINT64U uDst;
8732 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc2.ai16[0]);
8733 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + uSrc2.ai16[1]);
8734 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc2.ai16[2]);
8735 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + uSrc2.ai16[3]);
8736 *puDst = uDst.u;
8737}
8738
8739
8740IEM_DECL_IMPL_DEF(void, iemAImpl_paddsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8741{
8742 RT_NOREF(pFpuState);
8743 RTUINT128U uSrc1 = *puDst;
8744 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + puSrc->ai16[0]);
8745 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] + puSrc->ai16[1]);
8746 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + puSrc->ai16[2]);
8747 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] + puSrc->ai16[3]);
8748 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + puSrc->ai16[4]);
8749 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] + puSrc->ai16[5]);
8750 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + puSrc->ai16[6]);
8751 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] + puSrc->ai16[7]);
8752}
8753
8754#endif
8755
8756
8757/*
8758 * PADDUSW / VPADDUSW
8759 */
/**
 * Clamps an unsigned 32-bit value to the unsigned 16-bit range.
 *
 * Values up to 0xffff are passed through truncated to 16 bits; anything larger
 * yields 0xffff.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(a_uDword) \
        ( (uint32_t)(a_uDword) <= (uint16_t)0xffff \
          ? (uint16_t)(a_uDword) \
          : (uint16_t)0xffff ) /* 0xffff = UINT16_MAX */
8764
8765#ifdef IEM_WITHOUT_ASSEMBLY
8766
8767IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8768{
8769 RT_NOREF(pFpuState);
8770 RTUINT64U uSrc1 = { *puDst };
8771 RTUINT64U uSrc2 = { *puSrc };
8772 RTUINT64U uDst;
8773 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + uSrc2.au16[0]);
8774 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + uSrc2.au16[1]);
8775 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + uSrc2.au16[2]);
8776 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + uSrc2.au16[3]);
8777 *puDst = uDst.u;
8778}
8779
8780
8781IEM_DECL_IMPL_DEF(void, iemAImpl_paddusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8782{
8783 RT_NOREF(pFpuState);
8784 RTUINT128U uSrc1 = *puDst;
8785 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[0] + puSrc->au16[0]);
8786 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[1] + puSrc->au16[1]);
8787 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[2] + puSrc->au16[2]);
8788 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[3] + puSrc->au16[3]);
8789 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[4] + puSrc->au16[4]);
8790 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[5] + puSrc->au16[5]);
8791 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[6] + puSrc->au16[6]);
8792 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au16[7] + puSrc->au16[7]);
8793}
8794
8795#endif
8796
8797IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u128_fallback,(PRTUINT128U puDst,
8798 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8799{
8800 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8801 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8802 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8803 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8804 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8805 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8806 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8807 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8808}
8809
8810IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddusw_u256_fallback,(PRTUINT256U puDst,
8811 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8812{
8813 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[0] + puSrc2->au16[0]);
8814 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[1] + puSrc2->au16[1]);
8815 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[2] + puSrc2->au16[2]);
8816 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[3] + puSrc2->au16[3]);
8817 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[4] + puSrc2->au16[4]);
8818 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[5] + puSrc2->au16[5]);
8819 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[6] + puSrc2->au16[6]);
8820 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[7] + puSrc2->au16[7]);
8821 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[8] + puSrc2->au16[8]);
8822 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[9] + puSrc2->au16[9]);
8823 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[10] + puSrc2->au16[10]);
8824 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[11] + puSrc2->au16[11]);
8825 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[12] + puSrc2->au16[12]);
8826 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[13] + puSrc2->au16[13]);
8827 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[14] + puSrc2->au16[14]);
8828 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD(puSrc1->au16[15] + puSrc2->au16[15]);
8829}
8830
8831
8832/*
8833 * PADDD / VPADDD.
8834 */
8835#ifdef IEM_WITHOUT_ASSEMBLY
8836
8837IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8838{
8839 RT_NOREF(pFpuState);
8840 RTUINT64U uSrc1 = { *puDst };
8841 RTUINT64U uSrc2 = { *puSrc };
8842 RTUINT64U uDst;
8843 uDst.au32[0] = uSrc1.au32[0] + uSrc2.au32[0];
8844 uDst.au32[1] = uSrc1.au32[1] + uSrc2.au32[1];
8845 *puDst = uDst.u;
8846}
8847
8848
8849IEM_DECL_IMPL_DEF(void, iemAImpl_paddd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8850{
8851 RT_NOREF(pFpuState);
8852 RTUINT128U uSrc1 = *puDst;
8853 puDst->au32[0] = uSrc1.au32[0] + puSrc->au32[0];
8854 puDst->au32[1] = uSrc1.au32[1] + puSrc->au32[1];
8855 puDst->au32[2] = uSrc1.au32[2] + puSrc->au32[2];
8856 puDst->au32[3] = uSrc1.au32[3] + puSrc->au32[3];
8857}
8858
8859#endif /* IEM_WITHOUT_ASSEMBLY */
8860
8861IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8862 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8863{
8864 RT_NOREF(pExtState);
8865 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8866 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8867 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8868 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8869}
8870
8871IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8872 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8873{
8874 RT_NOREF(pExtState);
8875 puDst->au32[0] = puSrc1->au32[0] + puSrc2->au32[0];
8876 puDst->au32[1] = puSrc1->au32[1] + puSrc2->au32[1];
8877 puDst->au32[2] = puSrc1->au32[2] + puSrc2->au32[2];
8878 puDst->au32[3] = puSrc1->au32[3] + puSrc2->au32[3];
8879 puDst->au32[4] = puSrc1->au32[4] + puSrc2->au32[4];
8880 puDst->au32[5] = puSrc1->au32[5] + puSrc2->au32[5];
8881 puDst->au32[6] = puSrc1->au32[6] + puSrc2->au32[6];
8882 puDst->au32[7] = puSrc1->au32[7] + puSrc2->au32[7];
8883}
8884
8885
8886/*
8887 * PADDQ / VPADDQ.
8888 */
8889#ifdef IEM_WITHOUT_ASSEMBLY
8890
8891IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8892{
8893 RT_NOREF(pFpuState);
8894 *puDst = *puDst + *puSrc;
8895}
8896
8897IEM_DECL_IMPL_DEF(void, iemAImpl_paddq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8898{
8899 RT_NOREF(pFpuState);
8900 RTUINT128U uSrc1 = *puDst;
8901 puDst->au64[0] = uSrc1.au64[0] + puSrc->au64[0];
8902 puDst->au64[1] = uSrc1.au64[1] + puSrc->au64[1];
8903}
8904
8905#endif
8906
8907IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8908 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8909{
8910 RT_NOREF(pExtState);
8911 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8912 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8913}
8914
8915IEM_DECL_IMPL_DEF(void, iemAImpl_vpaddq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8916 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8917{
8918 RT_NOREF(pExtState);
8919 puDst->au64[0] = puSrc1->au64[0] + puSrc2->au64[0];
8920 puDst->au64[1] = puSrc1->au64[1] + puSrc2->au64[1];
8921 puDst->au64[2] = puSrc1->au64[2] + puSrc2->au64[2];
8922 puDst->au64[3] = puSrc1->au64[3] + puSrc2->au64[3];
8923}
8924
8925
8926/*
8927 * PSUBB / VPSUBB
8928 */
8929#ifdef IEM_WITHOUT_ASSEMBLY
8930
8931IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
8932{
8933 RT_NOREF(pFpuState);
8934 RTUINT64U uSrc1 = { *puDst };
8935 RTUINT64U uSrc2 = { *puSrc };
8936 RTUINT64U uDst;
8937 uDst.au8[0] = uSrc1.au8[0] - uSrc2.au8[0];
8938 uDst.au8[1] = uSrc1.au8[1] - uSrc2.au8[1];
8939 uDst.au8[2] = uSrc1.au8[2] - uSrc2.au8[2];
8940 uDst.au8[3] = uSrc1.au8[3] - uSrc2.au8[3];
8941 uDst.au8[4] = uSrc1.au8[4] - uSrc2.au8[4];
8942 uDst.au8[5] = uSrc1.au8[5] - uSrc2.au8[5];
8943 uDst.au8[6] = uSrc1.au8[6] - uSrc2.au8[6];
8944 uDst.au8[7] = uSrc1.au8[7] - uSrc2.au8[7];
8945 *puDst = uDst.u;
8946}
8947
8948
8949IEM_DECL_IMPL_DEF(void, iemAImpl_psubb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
8950{
8951 RT_NOREF(pFpuState);
8952 RTUINT128U uSrc1 = *puDst;
8953 puDst->au8[0] = uSrc1.au8[0] - puSrc->au8[0];
8954 puDst->au8[1] = uSrc1.au8[1] - puSrc->au8[1];
8955 puDst->au8[2] = uSrc1.au8[2] - puSrc->au8[2];
8956 puDst->au8[3] = uSrc1.au8[3] - puSrc->au8[3];
8957 puDst->au8[4] = uSrc1.au8[4] - puSrc->au8[4];
8958 puDst->au8[5] = uSrc1.au8[5] - puSrc->au8[5];
8959 puDst->au8[6] = uSrc1.au8[6] - puSrc->au8[6];
8960 puDst->au8[7] = uSrc1.au8[7] - puSrc->au8[7];
8961 puDst->au8[8] = uSrc1.au8[8] - puSrc->au8[8];
8962 puDst->au8[9] = uSrc1.au8[9] - puSrc->au8[9];
8963 puDst->au8[10] = uSrc1.au8[10] - puSrc->au8[10];
8964 puDst->au8[11] = uSrc1.au8[11] - puSrc->au8[11];
8965 puDst->au8[12] = uSrc1.au8[12] - puSrc->au8[12];
8966 puDst->au8[13] = uSrc1.au8[13] - puSrc->au8[13];
8967 puDst->au8[14] = uSrc1.au8[14] - puSrc->au8[14];
8968 puDst->au8[15] = uSrc1.au8[15] - puSrc->au8[15];
8969}
8970
8971#endif
8972
8973IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
8974 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
8975{
8976 RT_NOREF(pExtState);
8977 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
8978 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
8979 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
8980 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
8981 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
8982 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
8983 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
8984 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
8985 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
8986 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
8987 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
8988 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
8989 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
8990 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
8991 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
8992 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
8993}
8994
8995IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
8996 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
8997{
8998 RT_NOREF(pExtState);
8999 puDst->au8[0] = puSrc1->au8[0] - puSrc2->au8[0];
9000 puDst->au8[1] = puSrc1->au8[1] - puSrc2->au8[1];
9001 puDst->au8[2] = puSrc1->au8[2] - puSrc2->au8[2];
9002 puDst->au8[3] = puSrc1->au8[3] - puSrc2->au8[3];
9003 puDst->au8[4] = puSrc1->au8[4] - puSrc2->au8[4];
9004 puDst->au8[5] = puSrc1->au8[5] - puSrc2->au8[5];
9005 puDst->au8[6] = puSrc1->au8[6] - puSrc2->au8[6];
9006 puDst->au8[7] = puSrc1->au8[7] - puSrc2->au8[7];
9007 puDst->au8[8] = puSrc1->au8[8] - puSrc2->au8[8];
9008 puDst->au8[9] = puSrc1->au8[9] - puSrc2->au8[9];
9009 puDst->au8[10] = puSrc1->au8[10] - puSrc2->au8[10];
9010 puDst->au8[11] = puSrc1->au8[11] - puSrc2->au8[11];
9011 puDst->au8[12] = puSrc1->au8[12] - puSrc2->au8[12];
9012 puDst->au8[13] = puSrc1->au8[13] - puSrc2->au8[13];
9013 puDst->au8[14] = puSrc1->au8[14] - puSrc2->au8[14];
9014 puDst->au8[15] = puSrc1->au8[15] - puSrc2->au8[15];
9015 puDst->au8[16] = puSrc1->au8[16] - puSrc2->au8[16];
9016 puDst->au8[17] = puSrc1->au8[17] - puSrc2->au8[17];
9017 puDst->au8[18] = puSrc1->au8[18] - puSrc2->au8[18];
9018 puDst->au8[19] = puSrc1->au8[19] - puSrc2->au8[19];
9019 puDst->au8[20] = puSrc1->au8[20] - puSrc2->au8[20];
9020 puDst->au8[21] = puSrc1->au8[21] - puSrc2->au8[21];
9021 puDst->au8[22] = puSrc1->au8[22] - puSrc2->au8[22];
9022 puDst->au8[23] = puSrc1->au8[23] - puSrc2->au8[23];
9023 puDst->au8[24] = puSrc1->au8[24] - puSrc2->au8[24];
9024 puDst->au8[25] = puSrc1->au8[25] - puSrc2->au8[25];
9025 puDst->au8[26] = puSrc1->au8[26] - puSrc2->au8[26];
9026 puDst->au8[27] = puSrc1->au8[27] - puSrc2->au8[27];
9027 puDst->au8[28] = puSrc1->au8[28] - puSrc2->au8[28];
9028 puDst->au8[29] = puSrc1->au8[29] - puSrc2->au8[29];
9029 puDst->au8[30] = puSrc1->au8[30] - puSrc2->au8[30];
9030 puDst->au8[31] = puSrc1->au8[31] - puSrc2->au8[31];
9031}
9032
9033
9034/*
 * PSUBSB / VPSUBSB
9036 */
9037#ifdef IEM_WITHOUT_ASSEMBLY
9038
9039IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9040{
9041 RT_NOREF(pFpuState);
9042 RTUINT64U uSrc1 = { *puDst };
9043 RTUINT64U uSrc2 = { *puSrc };
9044 RTUINT64U uDst;
9045 uDst.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - uSrc2.ai8[0]);
9046 uDst.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - uSrc2.ai8[1]);
9047 uDst.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - uSrc2.ai8[2]);
9048 uDst.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - uSrc2.ai8[3]);
9049 uDst.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - uSrc2.ai8[4]);
9050 uDst.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - uSrc2.ai8[5]);
9051 uDst.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - uSrc2.ai8[6]);
9052 uDst.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - uSrc2.ai8[7]);
9053 *puDst = uDst.u;
9054}
9055
9056
9057IEM_DECL_IMPL_DEF(void, iemAImpl_psubsb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9058{
9059 RT_NOREF(pFpuState);
9060 RTUINT128U uSrc1 = *puDst;
9061 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[0] - puSrc->ai8[0]);
9062 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[1] - puSrc->ai8[1]);
9063 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[2] - puSrc->ai8[2]);
9064 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[3] - puSrc->ai8[3]);
9065 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[4] - puSrc->ai8[4]);
9066 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[5] - puSrc->ai8[5]);
9067 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[6] - puSrc->ai8[6]);
9068 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[7] - puSrc->ai8[7]);
9069 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[8] - puSrc->ai8[8]);
9070 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[9] - puSrc->ai8[9]);
9071 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[10] - puSrc->ai8[10]);
9072 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[11] - puSrc->ai8[11]);
9073 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[12] - puSrc->ai8[12]);
9074 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[13] - puSrc->ai8[13]);
9075 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[14] - puSrc->ai8[14]);
9076 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.ai8[15] - puSrc->ai8[15]);
9077}
9078
9079#endif
9080
9081IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u128_fallback,(PRTUINT128U puDst,
9082 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9083{
9084 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9085 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9086 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9087 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9088 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9089 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9090 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9091 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9092 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9093 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9094 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9095 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9096 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9097 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9098 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9099 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9100}
9101
9102IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsb_u256_fallback,(PRTUINT256U puDst,
9103 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9104{
9105 puDst->au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[0] - puSrc2->ai8[0]);
9106 puDst->au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[1] - puSrc2->ai8[1]);
9107 puDst->au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[2] - puSrc2->ai8[2]);
9108 puDst->au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[3] - puSrc2->ai8[3]);
9109 puDst->au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[4] - puSrc2->ai8[4]);
9110 puDst->au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[5] - puSrc2->ai8[5]);
9111 puDst->au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[6] - puSrc2->ai8[6]);
9112 puDst->au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[7] - puSrc2->ai8[7]);
9113 puDst->au8[8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[8] - puSrc2->ai8[8]);
9114 puDst->au8[9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[9] - puSrc2->ai8[9]);
9115 puDst->au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[10] - puSrc2->ai8[10]);
9116 puDst->au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[11] - puSrc2->ai8[11]);
9117 puDst->au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[12] - puSrc2->ai8[12]);
9118 puDst->au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[13] - puSrc2->ai8[13]);
9119 puDst->au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[14] - puSrc2->ai8[14]);
9120 puDst->au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[15] - puSrc2->ai8[15]);
9121 puDst->au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[16] - puSrc2->ai8[16]);
9122 puDst->au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[17] - puSrc2->ai8[17]);
9123 puDst->au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[18] - puSrc2->ai8[18]);
9124 puDst->au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[19] - puSrc2->ai8[19]);
9125 puDst->au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[20] - puSrc2->ai8[20]);
9126 puDst->au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[21] - puSrc2->ai8[21]);
9127 puDst->au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[22] - puSrc2->ai8[22]);
9128 puDst->au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[23] - puSrc2->ai8[23]);
9129 puDst->au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[24] - puSrc2->ai8[24]);
9130 puDst->au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[25] - puSrc2->ai8[25]);
9131 puDst->au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[26] - puSrc2->ai8[26]);
9132 puDst->au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[27] - puSrc2->ai8[27]);
9133 puDst->au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[28] - puSrc2->ai8[28]);
9134 puDst->au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[29] - puSrc2->ai8[29]);
9135 puDst->au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[30] - puSrc2->ai8[30]);
9136 puDst->au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(puSrc1->ai8[31] - puSrc2->ai8[31]);
9137}
9138
9139
9140/*
 * PSUBUSB / VPSUBUSB
9142 */
/**
 * Clamps the result of an unsigned byte subtraction to the unsigned 8-bit
 * range, saturating at zero.
 *
 * The argument is cast to uint16_t: values 0..0xff are passed through
 * truncated to a byte, while everything else yields 0.  A negative difference
 * converts to a value above 0xff and therefore ends up as 0.
 *
 * @note    The argument is evaluated more than once, so it must not have side
 *          effects.
 */
#define SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(a_uWord) \
        ( (uint16_t)(a_uWord) <= (uint16_t)0xff \
          ? (uint8_t)(a_uWord) \
          : (uint8_t)0 )
9147
9148#ifdef IEM_WITHOUT_ASSEMBLY
9149
9150IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9151{
9152 RT_NOREF(pFpuState);
9153 RTUINT64U uSrc1 = { *puDst };
9154 RTUINT64U uSrc2 = { *puSrc };
9155 RTUINT64U uDst;
9156 uDst.au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - uSrc2.au8[0]);
9157 uDst.au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - uSrc2.au8[1]);
9158 uDst.au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - uSrc2.au8[2]);
9159 uDst.au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - uSrc2.au8[3]);
9160 uDst.au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - uSrc2.au8[4]);
9161 uDst.au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - uSrc2.au8[5]);
9162 uDst.au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - uSrc2.au8[6]);
9163 uDst.au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - uSrc2.au8[7]);
9164 *puDst = uDst.u;
9165}
9166
9167
9168IEM_DECL_IMPL_DEF(void, iemAImpl_psubusb_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9169{
9170 RT_NOREF(pFpuState);
9171 RTUINT128U uSrc1 = *puDst;
9172 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[0] - puSrc->au8[0]);
9173 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[1] - puSrc->au8[1]);
9174 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[2] - puSrc->au8[2]);
9175 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[3] - puSrc->au8[3]);
9176 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[4] - puSrc->au8[4]);
9177 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[5] - puSrc->au8[5]);
9178 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[6] - puSrc->au8[6]);
9179 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[7] - puSrc->au8[7]);
9180 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[8] - puSrc->au8[8]);
9181 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[9] - puSrc->au8[9]);
9182 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[10] - puSrc->au8[10]);
9183 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[11] - puSrc->au8[11]);
9184 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[12] - puSrc->au8[12]);
9185 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[13] - puSrc->au8[13]);
9186 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[14] - puSrc->au8[14]);
9187 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(uSrc1.au8[15] - puSrc->au8[15]);
9188}
9189
9190#endif
9191
9192IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u128_fallback,(PRTUINT128U puDst,
9193 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9194{
9195 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9196 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9197 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9198 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9199 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9200 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9201 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9202 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9203 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9204 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9205 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9206 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9207 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9208 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9209 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9210 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9211}
9212
9213IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusb_u256_fallback,(PRTUINT256U puDst,
9214 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9215{
9216 puDst->au8[0] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[0] - puSrc2->au8[0]);
9217 puDst->au8[1] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[1] - puSrc2->au8[1]);
9218 puDst->au8[2] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[2] - puSrc2->au8[2]);
9219 puDst->au8[3] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[3] - puSrc2->au8[3]);
9220 puDst->au8[4] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[4] - puSrc2->au8[4]);
9221 puDst->au8[5] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[5] - puSrc2->au8[5]);
9222 puDst->au8[6] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[6] - puSrc2->au8[6]);
9223 puDst->au8[7] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[7] - puSrc2->au8[7]);
9224 puDst->au8[8] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[8] - puSrc2->au8[8]);
9225 puDst->au8[9] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[9] - puSrc2->au8[9]);
9226 puDst->au8[10] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[10] - puSrc2->au8[10]);
9227 puDst->au8[11] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[11] - puSrc2->au8[11]);
9228 puDst->au8[12] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[12] - puSrc2->au8[12]);
9229 puDst->au8[13] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[13] - puSrc2->au8[13]);
9230 puDst->au8[14] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[14] - puSrc2->au8[14]);
9231 puDst->au8[15] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[15] - puSrc2->au8[15]);
9232 puDst->au8[16] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[16] - puSrc2->au8[16]);
9233 puDst->au8[17] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[17] - puSrc2->au8[17]);
9234 puDst->au8[18] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[18] - puSrc2->au8[18]);
9235 puDst->au8[19] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[19] - puSrc2->au8[19]);
9236 puDst->au8[20] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[20] - puSrc2->au8[20]);
9237 puDst->au8[21] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[21] - puSrc2->au8[21]);
9238 puDst->au8[22] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[22] - puSrc2->au8[22]);
9239 puDst->au8[23] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[23] - puSrc2->au8[23]);
9240 puDst->au8[24] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[24] - puSrc2->au8[24]);
9241 puDst->au8[25] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[25] - puSrc2->au8[25]);
9242 puDst->au8[26] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[26] - puSrc2->au8[26]);
9243 puDst->au8[27] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[27] - puSrc2->au8[27]);
9244 puDst->au8[28] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[28] - puSrc2->au8[28]);
9245 puDst->au8[29] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[29] - puSrc2->au8[29]);
9246 puDst->au8[30] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[30] - puSrc2->au8[30]);
9247 puDst->au8[31] = SATURATED_UNSIGNED_WORD_TO_UNSIGNED_BYTE_SUB(puSrc1->au8[31] - puSrc2->au8[31]);
9248}
9249
9250
9251/*
9252 * PSUBW / VPSUBW
9253 */
9254#ifdef IEM_WITHOUT_ASSEMBLY
9255
9256IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9257{
9258 RT_NOREF(pFpuState);
9259 RTUINT64U uSrc1 = { *puDst };
9260 RTUINT64U uSrc2 = { *puSrc };
9261 RTUINT64U uDst;
9262 uDst.au16[0] = uSrc1.au16[0] - uSrc2.au16[0];
9263 uDst.au16[1] = uSrc1.au16[1] - uSrc2.au16[1];
9264 uDst.au16[2] = uSrc1.au16[2] - uSrc2.au16[2];
9265 uDst.au16[3] = uSrc1.au16[3] - uSrc2.au16[3];
9266 *puDst = uDst.u;
9267}
9268
9269
9270IEM_DECL_IMPL_DEF(void, iemAImpl_psubw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9271{
9272 RT_NOREF(pFpuState);
9273 RTUINT128U uSrc1 = *puDst;
9274 puDst->au16[0] = uSrc1.au16[0] - puSrc->au16[0];
9275 puDst->au16[1] = uSrc1.au16[1] - puSrc->au16[1];
9276 puDst->au16[2] = uSrc1.au16[2] - puSrc->au16[2];
9277 puDst->au16[3] = uSrc1.au16[3] - puSrc->au16[3];
9278 puDst->au16[4] = uSrc1.au16[4] - puSrc->au16[4];
9279 puDst->au16[5] = uSrc1.au16[5] - puSrc->au16[5];
9280 puDst->au16[6] = uSrc1.au16[6] - puSrc->au16[6];
9281 puDst->au16[7] = uSrc1.au16[7] - puSrc->au16[7];
9282}
9283
9284#endif
9285
9286IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9287 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9288{
9289 RT_NOREF(pExtState);
9290 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9291 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9292 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9293 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9294 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9295 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9296 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9297 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9298}
9299
9300IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9301 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9302{
9303 RT_NOREF(pExtState);
9304 puDst->au16[0] = puSrc1->au16[0] - puSrc2->au16[0];
9305 puDst->au16[1] = puSrc1->au16[1] - puSrc2->au16[1];
9306 puDst->au16[2] = puSrc1->au16[2] - puSrc2->au16[2];
9307 puDst->au16[3] = puSrc1->au16[3] - puSrc2->au16[3];
9308 puDst->au16[4] = puSrc1->au16[4] - puSrc2->au16[4];
9309 puDst->au16[5] = puSrc1->au16[5] - puSrc2->au16[5];
9310 puDst->au16[6] = puSrc1->au16[6] - puSrc2->au16[6];
9311 puDst->au16[7] = puSrc1->au16[7] - puSrc2->au16[7];
9312 puDst->au16[8] = puSrc1->au16[8] - puSrc2->au16[8];
9313 puDst->au16[9] = puSrc1->au16[9] - puSrc2->au16[9];
9314 puDst->au16[10] = puSrc1->au16[10] - puSrc2->au16[10];
9315 puDst->au16[11] = puSrc1->au16[11] - puSrc2->au16[11];
9316 puDst->au16[12] = puSrc1->au16[12] - puSrc2->au16[12];
9317 puDst->au16[13] = puSrc1->au16[13] - puSrc2->au16[13];
9318 puDst->au16[14] = puSrc1->au16[14] - puSrc2->au16[14];
9319 puDst->au16[15] = puSrc1->au16[15] - puSrc2->au16[15];
9320}
9321
9322
9323/*
9324 * PSUBSW / VPSUBSW
9325 */
9326#ifdef IEM_WITHOUT_ASSEMBLY
9327
9328IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9329{
9330 RT_NOREF(pFpuState);
9331 RTUINT64U uSrc1 = { *puDst };
9332 RTUINT64U uSrc2 = { *puSrc };
9333 RTUINT64U uDst;
9334 uDst.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc2.ai16[0]);
9335 uDst.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - uSrc2.ai16[1]);
9336 uDst.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc2.ai16[2]);
9337 uDst.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - uSrc2.ai16[3]);
9338 *puDst = uDst.u;
9339}
9340
9341
9342IEM_DECL_IMPL_DEF(void, iemAImpl_psubsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9343{
9344 RT_NOREF(pFpuState);
9345 RTUINT128U uSrc1 = *puDst;
9346 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - puSrc->ai16[0]);
9347 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[1] - puSrc->ai16[1]);
9348 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - puSrc->ai16[2]);
9349 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[3] - puSrc->ai16[3]);
9350 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - puSrc->ai16[4]);
9351 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[5] - puSrc->ai16[5]);
9352 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - puSrc->ai16[6]);
9353 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[7] - puSrc->ai16[7]);
9354}
9355
9356#endif
9357
9358IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u128_fallback,(PRTUINT128U puDst,
9359 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9360{
9361 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9362 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9363 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9364 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9365 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9366 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9367 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9368 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9369}
9370
9371IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubsw_u256_fallback,(PRTUINT256U puDst,
9372 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9373{
9374 puDst->au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc2->ai16[0]);
9375 puDst->au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[1] - puSrc2->ai16[1]);
9376 puDst->au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc2->ai16[2]);
9377 puDst->au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[3] - puSrc2->ai16[3]);
9378 puDst->au16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc2->ai16[4]);
9379 puDst->au16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[5] - puSrc2->ai16[5]);
9380 puDst->au16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc2->ai16[6]);
9381 puDst->au16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[7] - puSrc2->ai16[7]);
9382 puDst->au16[8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[8] - puSrc2->ai16[8]);
9383 puDst->au16[9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[9] - puSrc2->ai16[9]);
9384 puDst->au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc2->ai16[10]);
9385 puDst->au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[11] - puSrc2->ai16[11]);
9386 puDst->au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc2->ai16[12]);
9387 puDst->au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[13] - puSrc2->ai16[13]);
9388 puDst->au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc2->ai16[14]);
9389 puDst->au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[15] - puSrc2->ai16[15]);
9390}
9391
9392
9393/*
9394 * PSUBUSW / VPSUBUSW
9395 */
/**
 * Saturation helper for unsigned word subtraction.
 *
 * The argument is viewed as a uint32_t: values up to 0xffff are returned
 * truncated to uint16_t, anything larger (e.g. a negative int difference,
 * which becomes a huge unsigned value) yields zero.
 *
 * @note    Evaluates @a a_uDword twice; do not pass expressions with side effects.
 */
#define SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(a_uDword) \
    ( (uint32_t)(a_uDword) > (uint32_t)0xffff \
      ? (uint16_t)0 \
      : (uint16_t)(a_uDword) )
9400
9401#ifdef IEM_WITHOUT_ASSEMBLY
9402
9403IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9404{
9405 RT_NOREF(pFpuState);
9406 RTUINT64U uSrc1 = { *puDst };
9407 RTUINT64U uSrc2 = { *puSrc };
9408 RTUINT64U uDst;
9409 uDst.au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - uSrc2.au16[0]);
9410 uDst.au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - uSrc2.au16[1]);
9411 uDst.au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - uSrc2.au16[2]);
9412 uDst.au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - uSrc2.au16[3]);
9413 *puDst = uDst.u;
9414}
9415
9416
9417IEM_DECL_IMPL_DEF(void, iemAImpl_psubusw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9418{
9419 RT_NOREF(pFpuState);
9420 RTUINT128U uSrc1 = *puDst;
9421 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[0] - puSrc->au16[0]);
9422 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[1] - puSrc->au16[1]);
9423 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[2] - puSrc->au16[2]);
9424 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[3] - puSrc->au16[3]);
9425 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[4] - puSrc->au16[4]);
9426 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[5] - puSrc->au16[5]);
9427 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[6] - puSrc->au16[6]);
9428 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(uSrc1.au16[7] - puSrc->au16[7]);
9429}
9430
9431#endif
9432
9433IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u128_fallback,(PRTUINT128U puDst,
9434 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9435{
9436 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9437 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9438 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9439 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9440 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9441 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9442 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9443 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9444}
9445
9446IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubusw_u256_fallback,(PRTUINT256U puDst,
9447 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9448{
9449 puDst->au16[0] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[0] - puSrc2->au16[0]);
9450 puDst->au16[1] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[1] - puSrc2->au16[1]);
9451 puDst->au16[2] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[2] - puSrc2->au16[2]);
9452 puDst->au16[3] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[3] - puSrc2->au16[3]);
9453 puDst->au16[4] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[4] - puSrc2->au16[4]);
9454 puDst->au16[5] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[5] - puSrc2->au16[5]);
9455 puDst->au16[6] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[6] - puSrc2->au16[6]);
9456 puDst->au16[7] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[7] - puSrc2->au16[7]);
9457 puDst->au16[8] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[8] - puSrc2->au16[8]);
9458 puDst->au16[9] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[9] - puSrc2->au16[9]);
9459 puDst->au16[10] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[10] - puSrc2->au16[10]);
9460 puDst->au16[11] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[11] - puSrc2->au16[11]);
9461 puDst->au16[12] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[12] - puSrc2->au16[12]);
9462 puDst->au16[13] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[13] - puSrc2->au16[13]);
9463 puDst->au16[14] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[14] - puSrc2->au16[14]);
9464 puDst->au16[15] = SATURATED_UNSIGNED_DWORD_TO_UNSIGNED_WORD_SUB(puSrc1->au16[15] - puSrc2->au16[15]);
9465}
9466
9467
9468
9469/*
9470 * PSUBD / VPSUBD.
9471 */
9472#ifdef IEM_WITHOUT_ASSEMBLY
9473
9474IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9475{
9476 RT_NOREF(pFpuState);
9477 RTUINT64U uSrc1 = { *puDst };
9478 RTUINT64U uSrc2 = { *puSrc };
9479 RTUINT64U uDst;
9480 uDst.au32[0] = uSrc1.au32[0] - uSrc2.au32[0];
9481 uDst.au32[1] = uSrc1.au32[1] - uSrc2.au32[1];
9482 *puDst = uDst.u;
9483}
9484
9485
9486IEM_DECL_IMPL_DEF(void, iemAImpl_psubd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9487{
9488 RT_NOREF(pFpuState);
9489 RTUINT128U uSrc1 = *puDst;
9490 puDst->au32[0] = uSrc1.au32[0] - puSrc->au32[0];
9491 puDst->au32[1] = uSrc1.au32[1] - puSrc->au32[1];
9492 puDst->au32[2] = uSrc1.au32[2] - puSrc->au32[2];
9493 puDst->au32[3] = uSrc1.au32[3] - puSrc->au32[3];
9494}
9495
9496#endif /* IEM_WITHOUT_ASSEMBLY */
9497
9498IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9499 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9500{
9501 RT_NOREF(pExtState);
9502 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9503 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9504 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9505 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9506}
9507
9508IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9509 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9510{
9511 RT_NOREF(pExtState);
9512 puDst->au32[0] = puSrc1->au32[0] - puSrc2->au32[0];
9513 puDst->au32[1] = puSrc1->au32[1] - puSrc2->au32[1];
9514 puDst->au32[2] = puSrc1->au32[2] - puSrc2->au32[2];
9515 puDst->au32[3] = puSrc1->au32[3] - puSrc2->au32[3];
9516 puDst->au32[4] = puSrc1->au32[4] - puSrc2->au32[4];
9517 puDst->au32[5] = puSrc1->au32[5] - puSrc2->au32[5];
9518 puDst->au32[6] = puSrc1->au32[6] - puSrc2->au32[6];
9519 puDst->au32[7] = puSrc1->au32[7] - puSrc2->au32[7];
9520}
9521
9522
9523/*
9524 * PSUBQ / VPSUBQ.
9525 */
9526#ifdef IEM_WITHOUT_ASSEMBLY
9527
9528IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9529{
9530 RT_NOREF(pFpuState);
9531 *puDst = *puDst - *puSrc;
9532}
9533
9534IEM_DECL_IMPL_DEF(void, iemAImpl_psubq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9535{
9536 RT_NOREF(pFpuState);
9537 RTUINT128U uSrc1 = *puDst;
9538 puDst->au64[0] = uSrc1.au64[0] - puSrc->au64[0];
9539 puDst->au64[1] = uSrc1.au64[1] - puSrc->au64[1];
9540}
9541
9542#endif
9543
9544IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
9545 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9546{
9547 RT_NOREF(pExtState);
9548 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9549 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9550}
9551
9552IEM_DECL_IMPL_DEF(void, iemAImpl_vpsubq_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
9553 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9554{
9555 RT_NOREF(pExtState);
9556 puDst->au64[0] = puSrc1->au64[0] - puSrc2->au64[0];
9557 puDst->au64[1] = puSrc1->au64[1] - puSrc2->au64[1];
9558 puDst->au64[2] = puSrc1->au64[2] - puSrc2->au64[2];
9559 puDst->au64[3] = puSrc1->au64[3] - puSrc2->au64[3];
9560}
9561
9562
9563
9564/*
9565 * PMULLW / VPMULLW / PMULLD / VPMULLD
9566 */
9567#ifdef IEM_WITHOUT_ASSEMBLY
9568
9569IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9570{
9571 RT_NOREF(pFpuState);
9572 RTUINT64U uSrc1 = { *puDst };
9573 RTUINT64U uSrc2 = { *puSrc };
9574 RTUINT64U uDst;
9575 uDst.ai16[0] = uSrc1.ai16[0] * uSrc2.ai16[0];
9576 uDst.ai16[1] = uSrc1.ai16[1] * uSrc2.ai16[1];
9577 uDst.ai16[2] = uSrc1.ai16[2] * uSrc2.ai16[2];
9578 uDst.ai16[3] = uSrc1.ai16[3] * uSrc2.ai16[3];
9579 *puDst = uDst.u;
9580}
9581
9582
9583IEM_DECL_IMPL_DEF(void, iemAImpl_pmullw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9584{
9585 RT_NOREF(pFpuState);
9586 RTUINT128U uSrc1 = *puDst;
9587 puDst->ai16[0] = uSrc1.ai16[0] * puSrc->ai16[0];
9588 puDst->ai16[1] = uSrc1.ai16[1] * puSrc->ai16[1];
9589 puDst->ai16[2] = uSrc1.ai16[2] * puSrc->ai16[2];
9590 puDst->ai16[3] = uSrc1.ai16[3] * puSrc->ai16[3];
9591 puDst->ai16[4] = uSrc1.ai16[4] * puSrc->ai16[4];
9592 puDst->ai16[5] = uSrc1.ai16[5] * puSrc->ai16[5];
9593 puDst->ai16[6] = uSrc1.ai16[6] * puSrc->ai16[6];
9594 puDst->ai16[7] = uSrc1.ai16[7] * puSrc->ai16[7];
9595}
9596
9597#endif
9598
9599IEM_DECL_IMPL_DEF(void, iemAImpl_pmulld_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9600{
9601 RTUINT128U uSrc1 = *puDst;
9602
9603 puDst->ai32[0] = uSrc1.ai32[0] * puSrc->ai32[0];
9604 puDst->ai32[1] = uSrc1.ai32[1] * puSrc->ai32[1];
9605 puDst->ai32[2] = uSrc1.ai32[2] * puSrc->ai32[2];
9606 puDst->ai32[3] = uSrc1.ai32[3] * puSrc->ai32[3];
9607 RT_NOREF(pFpuState);
9608}
9609
9610
9611IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9612{
9613 puDst->ai16[0] = puSrc1->ai16[0] * puSrc2->ai16[0];
9614 puDst->ai16[1] = puSrc1->ai16[1] * puSrc2->ai16[1];
9615 puDst->ai16[2] = puSrc1->ai16[2] * puSrc2->ai16[2];
9616 puDst->ai16[3] = puSrc1->ai16[3] * puSrc2->ai16[3];
9617 puDst->ai16[4] = puSrc1->ai16[4] * puSrc2->ai16[4];
9618 puDst->ai16[5] = puSrc1->ai16[5] * puSrc2->ai16[5];
9619 puDst->ai16[6] = puSrc1->ai16[6] * puSrc2->ai16[6];
9620 puDst->ai16[7] = puSrc1->ai16[7] * puSrc2->ai16[7];
9621}
9622
9623
9624IEM_DECL_IMPL_DEF(void, iemAImpl_vpmullw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9625{
9626 puDst->ai16[ 0] = puSrc1->ai16[ 0] * puSrc2->ai16[ 0];
9627 puDst->ai16[ 1] = puSrc1->ai16[ 1] * puSrc2->ai16[ 1];
9628 puDst->ai16[ 2] = puSrc1->ai16[ 2] * puSrc2->ai16[ 2];
9629 puDst->ai16[ 3] = puSrc1->ai16[ 3] * puSrc2->ai16[ 3];
9630 puDst->ai16[ 4] = puSrc1->ai16[ 4] * puSrc2->ai16[ 4];
9631 puDst->ai16[ 5] = puSrc1->ai16[ 5] * puSrc2->ai16[ 5];
9632 puDst->ai16[ 6] = puSrc1->ai16[ 6] * puSrc2->ai16[ 6];
9633 puDst->ai16[ 7] = puSrc1->ai16[ 7] * puSrc2->ai16[ 7];
9634 puDst->ai16[ 8] = puSrc1->ai16[ 8] * puSrc2->ai16[ 8];
9635 puDst->ai16[ 9] = puSrc1->ai16[ 9] * puSrc2->ai16[ 9];
9636 puDst->ai16[10] = puSrc1->ai16[10] * puSrc2->ai16[10];
9637 puDst->ai16[11] = puSrc1->ai16[11] * puSrc2->ai16[11];
9638 puDst->ai16[12] = puSrc1->ai16[12] * puSrc2->ai16[12];
9639 puDst->ai16[13] = puSrc1->ai16[13] * puSrc2->ai16[13];
9640 puDst->ai16[14] = puSrc1->ai16[14] * puSrc2->ai16[14];
9641 puDst->ai16[15] = puSrc1->ai16[15] * puSrc2->ai16[15];
9642}
9643
9644
9645IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9646{
9647 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9648 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9649 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9650 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9651}
9652
9653
9654IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulld_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9655{
9656 puDst->ai32[0] = puSrc1->ai32[0] * puSrc2->ai32[0];
9657 puDst->ai32[1] = puSrc1->ai32[1] * puSrc2->ai32[1];
9658 puDst->ai32[2] = puSrc1->ai32[2] * puSrc2->ai32[2];
9659 puDst->ai32[3] = puSrc1->ai32[3] * puSrc2->ai32[3];
9660 puDst->ai32[4] = puSrc1->ai32[4] * puSrc2->ai32[4];
9661 puDst->ai32[5] = puSrc1->ai32[5] * puSrc2->ai32[5];
9662 puDst->ai32[6] = puSrc1->ai32[6] * puSrc2->ai32[6];
9663 puDst->ai32[7] = puSrc1->ai32[7] * puSrc2->ai32[7];
9664}
9665
9666
9667/*
9668 * PMULHW / VPMULHW
9669 */
9670#ifdef IEM_WITHOUT_ASSEMBLY
9671
9672IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
9673{
9674 RT_NOREF(pFpuState);
9675 RTUINT64U uSrc1 = { *puDst };
9676 RTUINT64U uSrc2 = { *puSrc };
9677 RTUINT64U uDst;
9678 uDst.ai16[0] = RT_HIWORD(uSrc1.ai16[0] * uSrc2.ai16[0]);
9679 uDst.ai16[1] = RT_HIWORD(uSrc1.ai16[1] * uSrc2.ai16[1]);
9680 uDst.ai16[2] = RT_HIWORD(uSrc1.ai16[2] * uSrc2.ai16[2]);
9681 uDst.ai16[3] = RT_HIWORD(uSrc1.ai16[3] * uSrc2.ai16[3]);
9682 *puDst = uDst.u;
9683}
9684
9685
9686IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
9687{
9688 RT_NOREF(pFpuState);
9689 RTUINT128U uSrc1 = *puDst;
9690 puDst->ai16[0] = RT_HIWORD(uSrc1.ai16[0] * puSrc->ai16[0]);
9691 puDst->ai16[1] = RT_HIWORD(uSrc1.ai16[1] * puSrc->ai16[1]);
9692 puDst->ai16[2] = RT_HIWORD(uSrc1.ai16[2] * puSrc->ai16[2]);
9693 puDst->ai16[3] = RT_HIWORD(uSrc1.ai16[3] * puSrc->ai16[3]);
9694 puDst->ai16[4] = RT_HIWORD(uSrc1.ai16[4] * puSrc->ai16[4]);
9695 puDst->ai16[5] = RT_HIWORD(uSrc1.ai16[5] * puSrc->ai16[5]);
9696 puDst->ai16[6] = RT_HIWORD(uSrc1.ai16[6] * puSrc->ai16[6]);
9697 puDst->ai16[7] = RT_HIWORD(uSrc1.ai16[7] * puSrc->ai16[7]);
9698}
9699
9700#endif
9701
9702IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9703{
9704 puDst->ai16[0] = RT_HIWORD(puSrc1->ai16[0] * puSrc2->ai16[0]);
9705 puDst->ai16[1] = RT_HIWORD(puSrc1->ai16[1] * puSrc2->ai16[1]);
9706 puDst->ai16[2] = RT_HIWORD(puSrc1->ai16[2] * puSrc2->ai16[2]);
9707 puDst->ai16[3] = RT_HIWORD(puSrc1->ai16[3] * puSrc2->ai16[3]);
9708 puDst->ai16[4] = RT_HIWORD(puSrc1->ai16[4] * puSrc2->ai16[4]);
9709 puDst->ai16[5] = RT_HIWORD(puSrc1->ai16[5] * puSrc2->ai16[5]);
9710 puDst->ai16[6] = RT_HIWORD(puSrc1->ai16[6] * puSrc2->ai16[6]);
9711 puDst->ai16[7] = RT_HIWORD(puSrc1->ai16[7] * puSrc2->ai16[7]);
9712}
9713
9714
9715IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9716{
9717 puDst->ai16[ 0] = RT_HIWORD(puSrc1->ai16[ 0] * puSrc2->ai16[ 0]);
9718 puDst->ai16[ 1] = RT_HIWORD(puSrc1->ai16[ 1] * puSrc2->ai16[ 1]);
9719 puDst->ai16[ 2] = RT_HIWORD(puSrc1->ai16[ 2] * puSrc2->ai16[ 2]);
9720 puDst->ai16[ 3] = RT_HIWORD(puSrc1->ai16[ 3] * puSrc2->ai16[ 3]);
9721 puDst->ai16[ 4] = RT_HIWORD(puSrc1->ai16[ 4] * puSrc2->ai16[ 4]);
9722 puDst->ai16[ 5] = RT_HIWORD(puSrc1->ai16[ 5] * puSrc2->ai16[ 5]);
9723 puDst->ai16[ 6] = RT_HIWORD(puSrc1->ai16[ 6] * puSrc2->ai16[ 6]);
9724 puDst->ai16[ 7] = RT_HIWORD(puSrc1->ai16[ 7] * puSrc2->ai16[ 7]);
9725 puDst->ai16[ 8] = RT_HIWORD(puSrc1->ai16[ 8] * puSrc2->ai16[ 8]);
9726 puDst->ai16[ 9] = RT_HIWORD(puSrc1->ai16[ 9] * puSrc2->ai16[ 9]);
9727 puDst->ai16[10] = RT_HIWORD(puSrc1->ai16[10] * puSrc2->ai16[10]);
9728 puDst->ai16[11] = RT_HIWORD(puSrc1->ai16[11] * puSrc2->ai16[11]);
9729 puDst->ai16[12] = RT_HIWORD(puSrc1->ai16[12] * puSrc2->ai16[12]);
9730 puDst->ai16[13] = RT_HIWORD(puSrc1->ai16[13] * puSrc2->ai16[13]);
9731 puDst->ai16[14] = RT_HIWORD(puSrc1->ai16[14] * puSrc2->ai16[14]);
9732 puDst->ai16[15] = RT_HIWORD(puSrc1->ai16[15] * puSrc2->ai16[15]);
9733}
9734
9735
9736/*
9737 * PMULHUW / VPMULHUW
9738 */
9739#ifdef IEM_WITHOUT_ASSEMBLY
9740
9741IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9742{
9743 RTUINT64U uSrc1 = { *puDst };
9744 RTUINT64U uSrc2 = { *puSrc };
9745 RTUINT64U uDst;
9746 uDst.au16[0] = RT_HIWORD(uSrc1.au16[0] * uSrc2.au16[0]);
9747 uDst.au16[1] = RT_HIWORD(uSrc1.au16[1] * uSrc2.au16[1]);
9748 uDst.au16[2] = RT_HIWORD(uSrc1.au16[2] * uSrc2.au16[2]);
9749 uDst.au16[3] = RT_HIWORD(uSrc1.au16[3] * uSrc2.au16[3]);
9750 *puDst = uDst.u;
9751}
9752
9753
9754IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhuw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9755{
9756 RTUINT128U uSrc1 = *puDst;
9757 puDst->au16[0] = RT_HIWORD(uSrc1.au16[0] * puSrc->au16[0]);
9758 puDst->au16[1] = RT_HIWORD(uSrc1.au16[1] * puSrc->au16[1]);
9759 puDst->au16[2] = RT_HIWORD(uSrc1.au16[2] * puSrc->au16[2]);
9760 puDst->au16[3] = RT_HIWORD(uSrc1.au16[3] * puSrc->au16[3]);
9761 puDst->au16[4] = RT_HIWORD(uSrc1.au16[4] * puSrc->au16[4]);
9762 puDst->au16[5] = RT_HIWORD(uSrc1.au16[5] * puSrc->au16[5]);
9763 puDst->au16[6] = RT_HIWORD(uSrc1.au16[6] * puSrc->au16[6]);
9764 puDst->au16[7] = RT_HIWORD(uSrc1.au16[7] * puSrc->au16[7]);
9765}
9766
9767#endif
9768
9769IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
9770{
9771 puDst->au16[0] = RT_HIWORD(puSrc1->au16[0] * puSrc2->au16[0]);
9772 puDst->au16[1] = RT_HIWORD(puSrc1->au16[1] * puSrc2->au16[1]);
9773 puDst->au16[2] = RT_HIWORD(puSrc1->au16[2] * puSrc2->au16[2]);
9774 puDst->au16[3] = RT_HIWORD(puSrc1->au16[3] * puSrc2->au16[3]);
9775 puDst->au16[4] = RT_HIWORD(puSrc1->au16[4] * puSrc2->au16[4]);
9776 puDst->au16[5] = RT_HIWORD(puSrc1->au16[5] * puSrc2->au16[5]);
9777 puDst->au16[6] = RT_HIWORD(puSrc1->au16[6] * puSrc2->au16[6]);
9778 puDst->au16[7] = RT_HIWORD(puSrc1->au16[7] * puSrc2->au16[7]);
9779}
9780
9781
9782IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhuw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
9783{
9784 puDst->au16[ 0] = RT_HIWORD(puSrc1->au16[ 0] * puSrc2->au16[ 0]);
9785 puDst->au16[ 1] = RT_HIWORD(puSrc1->au16[ 1] * puSrc2->au16[ 1]);
9786 puDst->au16[ 2] = RT_HIWORD(puSrc1->au16[ 2] * puSrc2->au16[ 2]);
9787 puDst->au16[ 3] = RT_HIWORD(puSrc1->au16[ 3] * puSrc2->au16[ 3]);
9788 puDst->au16[ 4] = RT_HIWORD(puSrc1->au16[ 4] * puSrc2->au16[ 4]);
9789 puDst->au16[ 5] = RT_HIWORD(puSrc1->au16[ 5] * puSrc2->au16[ 5]);
9790 puDst->au16[ 6] = RT_HIWORD(puSrc1->au16[ 6] * puSrc2->au16[ 6]);
9791 puDst->au16[ 7] = RT_HIWORD(puSrc1->au16[ 7] * puSrc2->au16[ 7]);
9792 puDst->au16[ 8] = RT_HIWORD(puSrc1->au16[ 8] * puSrc2->au16[ 8]);
9793 puDst->au16[ 9] = RT_HIWORD(puSrc1->au16[ 9] * puSrc2->au16[ 9]);
9794 puDst->au16[10] = RT_HIWORD(puSrc1->au16[10] * puSrc2->au16[10]);
9795 puDst->au16[11] = RT_HIWORD(puSrc1->au16[11] * puSrc2->au16[11]);
9796 puDst->au16[12] = RT_HIWORD(puSrc1->au16[12] * puSrc2->au16[12]);
9797 puDst->au16[13] = RT_HIWORD(puSrc1->au16[13] * puSrc2->au16[13]);
9798 puDst->au16[14] = RT_HIWORD(puSrc1->au16[14] * puSrc2->au16[14]);
9799 puDst->au16[15] = RT_HIWORD(puSrc1->au16[15] * puSrc2->au16[15]);
9800}
9801
9802
9803/*
9804 * PSRLW / VPSRLW
9805 */
9806#ifdef IEM_WITHOUT_ASSEMBLY
9807
9808IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9809{
9810 RTUINT64U uSrc1 = { *puDst };
9811 RTUINT64U uSrc2 = { *puSrc };
9812 RTUINT64U uDst;
9813
9814 if (uSrc2.au64[0] <= 15)
9815 {
9816 uDst.au16[0] = uSrc1.au16[0] >> uSrc2.au8[0];
9817 uDst.au16[1] = uSrc1.au16[1] >> uSrc2.au8[0];
9818 uDst.au16[2] = uSrc1.au16[2] >> uSrc2.au8[0];
9819 uDst.au16[3] = uSrc1.au16[3] >> uSrc2.au8[0];
9820 }
9821 else
9822 {
9823 uDst.au64[0] = 0;
9824 }
9825 *puDst = uDst.u;
9826}
9827
9828
9829IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9830{
9831 RTUINT64U uSrc1 = { *puDst };
9832 RTUINT64U uDst;
9833
9834 if (uShift <= 15)
9835 {
9836 uDst.au16[0] = uSrc1.au16[0] >> uShift;
9837 uDst.au16[1] = uSrc1.au16[1] >> uShift;
9838 uDst.au16[2] = uSrc1.au16[2] >> uShift;
9839 uDst.au16[3] = uSrc1.au16[3] >> uShift;
9840 }
9841 else
9842 {
9843 uDst.au64[0] = 0;
9844 }
9845 *puDst = uDst.u;
9846}
9847
9848
9849IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9850{
9851 RTUINT128U uSrc1 = *puDst;
9852
9853 if (puSrc->au64[0] <= 15)
9854 {
9855 puDst->au16[0] = uSrc1.au16[0] >> puSrc->au8[0];
9856 puDst->au16[1] = uSrc1.au16[1] >> puSrc->au8[0];
9857 puDst->au16[2] = uSrc1.au16[2] >> puSrc->au8[0];
9858 puDst->au16[3] = uSrc1.au16[3] >> puSrc->au8[0];
9859 puDst->au16[4] = uSrc1.au16[4] >> puSrc->au8[0];
9860 puDst->au16[5] = uSrc1.au16[5] >> puSrc->au8[0];
9861 puDst->au16[6] = uSrc1.au16[6] >> puSrc->au8[0];
9862 puDst->au16[7] = uSrc1.au16[7] >> puSrc->au8[0];
9863 }
9864 else
9865 {
9866 puDst->au64[0] = 0;
9867 puDst->au64[1] = 0;
9868 }
9869}
9870
9871IEM_DECL_IMPL_DEF(void, iemAImpl_psrlw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9872{
9873 RTUINT128U uSrc1 = *puDst;
9874
9875 if (uShift <= 15)
9876 {
9877 puDst->au16[0] = uSrc1.au16[0] >> uShift;
9878 puDst->au16[1] = uSrc1.au16[1] >> uShift;
9879 puDst->au16[2] = uSrc1.au16[2] >> uShift;
9880 puDst->au16[3] = uSrc1.au16[3] >> uShift;
9881 puDst->au16[4] = uSrc1.au16[4] >> uShift;
9882 puDst->au16[5] = uSrc1.au16[5] >> uShift;
9883 puDst->au16[6] = uSrc1.au16[6] >> uShift;
9884 puDst->au16[7] = uSrc1.au16[7] >> uShift;
9885 }
9886 else
9887 {
9888 puDst->au64[0] = 0;
9889 puDst->au64[1] = 0;
9890 }
9891}
9892
9893#endif
9894
9895
9896/*
9897 * PSRAW / VPSRAW
9898 */
9899#ifdef IEM_WITHOUT_ASSEMBLY
9900
9901IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9902{
9903 RTUINT64U uSrc1 = { *puDst };
9904 RTUINT64U uSrc2 = { *puSrc };
9905 RTUINT64U uDst;
9906
9907 if (uSrc2.au64[0] <= 15)
9908 {
9909 uDst.ai16[0] = uSrc1.ai16[0] >> uSrc2.au8[0];
9910 uDst.ai16[1] = uSrc1.ai16[1] >> uSrc2.au8[0];
9911 uDst.ai16[2] = uSrc1.ai16[2] >> uSrc2.au8[0];
9912 uDst.ai16[3] = uSrc1.ai16[3] >> uSrc2.au8[0];
9913 }
9914 else
9915 {
9916 uDst.au64[0] = 0;
9917 }
9918 *puDst = uDst.u;
9919}
9920
9921
9922IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u64,(uint64_t *puDst, uint8_t uShift))
9923{
9924 RTUINT64U uSrc1 = { *puDst };
9925 RTUINT64U uDst;
9926
9927 if (uShift <= 15)
9928 {
9929 uDst.ai16[0] = uSrc1.ai16[0] >> uShift;
9930 uDst.ai16[1] = uSrc1.ai16[1] >> uShift;
9931 uDst.ai16[2] = uSrc1.ai16[2] >> uShift;
9932 uDst.ai16[3] = uSrc1.ai16[3] >> uShift;
9933 }
9934 else
9935 {
9936 uDst.au64[0] = 0;
9937 }
9938 *puDst = uDst.u;
9939}
9940
9941
9942IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
9943{
9944 RTUINT128U uSrc1 = *puDst;
9945
9946 if (puSrc->au64[0] <= 15)
9947 {
9948 puDst->ai16[0] = uSrc1.ai16[0] >> puSrc->au8[0];
9949 puDst->ai16[1] = uSrc1.ai16[1] >> puSrc->au8[0];
9950 puDst->ai16[2] = uSrc1.ai16[2] >> puSrc->au8[0];
9951 puDst->ai16[3] = uSrc1.ai16[3] >> puSrc->au8[0];
9952 puDst->ai16[4] = uSrc1.ai16[4] >> puSrc->au8[0];
9953 puDst->ai16[5] = uSrc1.ai16[5] >> puSrc->au8[0];
9954 puDst->ai16[6] = uSrc1.ai16[6] >> puSrc->au8[0];
9955 puDst->ai16[7] = uSrc1.ai16[7] >> puSrc->au8[0];
9956 }
9957 else
9958 {
9959 puDst->au64[0] = 0;
9960 puDst->au64[1] = 0;
9961 }
9962}
9963
9964IEM_DECL_IMPL_DEF(void, iemAImpl_psraw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
9965{
9966 RTUINT128U uSrc1 = *puDst;
9967
9968 if (uShift <= 15)
9969 {
9970 puDst->ai16[0] = uSrc1.ai16[0] >> uShift;
9971 puDst->ai16[1] = uSrc1.ai16[1] >> uShift;
9972 puDst->ai16[2] = uSrc1.ai16[2] >> uShift;
9973 puDst->ai16[3] = uSrc1.ai16[3] >> uShift;
9974 puDst->ai16[4] = uSrc1.ai16[4] >> uShift;
9975 puDst->ai16[5] = uSrc1.ai16[5] >> uShift;
9976 puDst->ai16[6] = uSrc1.ai16[6] >> uShift;
9977 puDst->ai16[7] = uSrc1.ai16[7] >> uShift;
9978 }
9979 else
9980 {
9981 puDst->au64[0] = 0;
9982 puDst->au64[1] = 0;
9983 }
9984}
9985
9986#endif
9987
9988
9989/*
9990 * PSLLW / VPSLLW
9991 */
9992#ifdef IEM_WITHOUT_ASSEMBLY
9993
9994IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u64,(uint64_t *puDst, uint64_t const *puSrc))
9995{
9996 RTUINT64U uSrc1 = { *puDst };
9997 RTUINT64U uSrc2 = { *puSrc };
9998 RTUINT64U uDst;
9999
10000 if (uSrc2.au64[0] <= 15)
10001 {
10002 uDst.au16[0] = uSrc1.au16[0] << uSrc2.au8[0];
10003 uDst.au16[1] = uSrc1.au16[1] << uSrc2.au8[0];
10004 uDst.au16[2] = uSrc1.au16[2] << uSrc2.au8[0];
10005 uDst.au16[3] = uSrc1.au16[3] << uSrc2.au8[0];
10006 }
10007 else
10008 {
10009 uDst.au64[0] = 0;
10010 }
10011 *puDst = uDst.u;
10012}
10013
10014
10015IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u64,(uint64_t *puDst, uint8_t uShift))
10016{
10017 RTUINT64U uSrc1 = { *puDst };
10018 RTUINT64U uDst;
10019
10020 if (uShift <= 15)
10021 {
10022 uDst.au16[0] = uSrc1.au16[0] << uShift;
10023 uDst.au16[1] = uSrc1.au16[1] << uShift;
10024 uDst.au16[2] = uSrc1.au16[2] << uShift;
10025 uDst.au16[3] = uSrc1.au16[3] << uShift;
10026 }
10027 else
10028 {
10029 uDst.au64[0] = 0;
10030 }
10031 *puDst = uDst.u;
10032}
10033
10034
10035IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10036{
10037 RTUINT128U uSrc1 = *puDst;
10038
10039 if (puSrc->au64[0] <= 15)
10040 {
10041 puDst->au16[0] = uSrc1.au16[0] << puSrc->au8[0];
10042 puDst->au16[1] = uSrc1.au16[1] << puSrc->au8[0];
10043 puDst->au16[2] = uSrc1.au16[2] << puSrc->au8[0];
10044 puDst->au16[3] = uSrc1.au16[3] << puSrc->au8[0];
10045 puDst->au16[4] = uSrc1.au16[4] << puSrc->au8[0];
10046 puDst->au16[5] = uSrc1.au16[5] << puSrc->au8[0];
10047 puDst->au16[6] = uSrc1.au16[6] << puSrc->au8[0];
10048 puDst->au16[7] = uSrc1.au16[7] << puSrc->au8[0];
10049 }
10050 else
10051 {
10052 puDst->au64[0] = 0;
10053 puDst->au64[1] = 0;
10054 }
10055}
10056
10057IEM_DECL_IMPL_DEF(void, iemAImpl_psllw_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10058{
10059 RTUINT128U uSrc1 = *puDst;
10060
10061 if (uShift <= 15)
10062 {
10063 puDst->au16[0] = uSrc1.au16[0] << uShift;
10064 puDst->au16[1] = uSrc1.au16[1] << uShift;
10065 puDst->au16[2] = uSrc1.au16[2] << uShift;
10066 puDst->au16[3] = uSrc1.au16[3] << uShift;
10067 puDst->au16[4] = uSrc1.au16[4] << uShift;
10068 puDst->au16[5] = uSrc1.au16[5] << uShift;
10069 puDst->au16[6] = uSrc1.au16[6] << uShift;
10070 puDst->au16[7] = uSrc1.au16[7] << uShift;
10071 }
10072 else
10073 {
10074 puDst->au64[0] = 0;
10075 puDst->au64[1] = 0;
10076 }
10077}
10078
10079#endif
10080
10081
10082/*
10083 * PSRLD / VPSRLD
10084 */
10085#ifdef IEM_WITHOUT_ASSEMBLY
10086
10087IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10088{
10089 RTUINT64U uSrc1 = { *puDst };
10090 RTUINT64U uSrc2 = { *puSrc };
10091 RTUINT64U uDst;
10092
10093 if (uSrc2.au64[0] <= 31)
10094 {
10095 uDst.au32[0] = uSrc1.au32[0] >> uSrc2.au8[0];
10096 uDst.au32[1] = uSrc1.au32[1] >> uSrc2.au8[0];
10097 }
10098 else
10099 {
10100 uDst.au64[0] = 0;
10101 }
10102 *puDst = uDst.u;
10103}
10104
10105
10106IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10107{
10108 RTUINT64U uSrc1 = { *puDst };
10109 RTUINT64U uDst;
10110
10111 if (uShift <= 31)
10112 {
10113 uDst.au32[0] = uSrc1.au32[0] >> uShift;
10114 uDst.au32[1] = uSrc1.au32[1] >> uShift;
10115 }
10116 else
10117 {
10118 uDst.au64[0] = 0;
10119 }
10120 *puDst = uDst.u;
10121}
10122
10123
10124IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10125{
10126 RTUINT128U uSrc1 = *puDst;
10127
10128 if (puSrc->au64[0] <= 31)
10129 {
10130 puDst->au32[0] = uSrc1.au32[0] >> puSrc->au8[0];
10131 puDst->au32[1] = uSrc1.au32[1] >> puSrc->au8[0];
10132 puDst->au32[2] = uSrc1.au32[2] >> puSrc->au8[0];
10133 puDst->au32[3] = uSrc1.au32[3] >> puSrc->au8[0];
10134 }
10135 else
10136 {
10137 puDst->au64[0] = 0;
10138 puDst->au64[1] = 0;
10139 }
10140}
10141
10142IEM_DECL_IMPL_DEF(void, iemAImpl_psrld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10143{
10144 RTUINT128U uSrc1 = *puDst;
10145
10146 if (uShift <= 31)
10147 {
10148 puDst->au32[0] = uSrc1.au32[0] >> uShift;
10149 puDst->au32[1] = uSrc1.au32[1] >> uShift;
10150 puDst->au32[2] = uSrc1.au32[2] >> uShift;
10151 puDst->au32[3] = uSrc1.au32[3] >> uShift;
10152 }
10153 else
10154 {
10155 puDst->au64[0] = 0;
10156 puDst->au64[1] = 0;
10157 }
10158}
10159
10160#endif
10161
10162
10163/*
10164 * PSRAD / VPSRAD
10165 */
10166#ifdef IEM_WITHOUT_ASSEMBLY
10167
10168IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u64,(uint64_t *puDst, uint64_t const *puSrc))
10169{
10170 RTUINT64U uSrc1 = { *puDst };
10171 RTUINT64U uSrc2 = { *puSrc };
10172 RTUINT64U uDst;
10173
10174 if (uSrc2.au64[0] <= 31)
10175 {
10176 uDst.ai32[0] = uSrc1.ai32[0] >> uSrc2.au8[0];
10177 uDst.ai32[1] = uSrc1.ai32[1] >> uSrc2.au8[0];
10178 }
10179 else
10180 {
10181 uDst.au64[0] = 0;
10182 }
10183 *puDst = uDst.u;
10184}
10185
10186
10187IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u64,(uint64_t *puDst, uint8_t uShift))
10188{
10189 RTUINT64U uSrc1 = { *puDst };
10190 RTUINT64U uDst;
10191
10192 if (uShift <= 31)
10193 {
10194 uDst.ai32[0] = uSrc1.ai32[0] >> uShift;
10195 uDst.ai32[1] = uSrc1.ai32[1] >> uShift;
10196 }
10197 else
10198 {
10199 uDst.au64[0] = 0;
10200 }
10201 *puDst = uDst.u;
10202}
10203
10204
10205IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10206{
10207 RTUINT128U uSrc1 = *puDst;
10208
10209 if (puSrc->au64[0] <= 31)
10210 {
10211 puDst->ai32[0] = uSrc1.ai32[0] >> puSrc->au8[0];
10212 puDst->ai32[1] = uSrc1.ai32[1] >> puSrc->au8[0];
10213 puDst->ai32[2] = uSrc1.ai32[2] >> puSrc->au8[0];
10214 puDst->ai32[3] = uSrc1.ai32[3] >> puSrc->au8[0];
10215 }
10216 else
10217 {
10218 puDst->au64[0] = 0;
10219 puDst->au64[1] = 0;
10220 }
10221}
10222
10223IEM_DECL_IMPL_DEF(void, iemAImpl_psrad_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10224{
10225 RTUINT128U uSrc1 = *puDst;
10226
10227 if (uShift <= 31)
10228 {
10229 puDst->ai32[0] = uSrc1.ai32[0] >> uShift;
10230 puDst->ai32[1] = uSrc1.ai32[1] >> uShift;
10231 puDst->ai32[2] = uSrc1.ai32[2] >> uShift;
10232 puDst->ai32[3] = uSrc1.ai32[3] >> uShift;
10233 }
10234 else
10235 {
10236 puDst->au64[0] = 0;
10237 puDst->au64[1] = 0;
10238 }
10239}
10240
10241#endif
10242
10243
10244/*
10245 * PSLLD / VPSLLD
10246 */
10247#ifdef IEM_WITHOUT_ASSEMBLY
10248
10249IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u64,(uint64_t *puDst, uint64_t const *puSrc))
10250{
10251 RTUINT64U uSrc1 = { *puDst };
10252 RTUINT64U uSrc2 = { *puSrc };
10253 RTUINT64U uDst;
10254
10255 if (uSrc2.au64[0] <= 31)
10256 {
10257 uDst.au32[0] = uSrc1.au32[0] << uSrc2.au8[0];
10258 uDst.au32[1] = uSrc1.au32[1] << uSrc2.au8[0];
10259 }
10260 else
10261 {
10262 uDst.au64[0] = 0;
10263 }
10264 *puDst = uDst.u;
10265}
10266
10267
10268IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u64,(uint64_t *puDst, uint8_t uShift))
10269{
10270 RTUINT64U uSrc1 = { *puDst };
10271 RTUINT64U uDst;
10272
10273 if (uShift <= 31)
10274 {
10275 uDst.au32[0] = uSrc1.au32[0] << uShift;
10276 uDst.au32[1] = uSrc1.au32[1] << uShift;
10277 }
10278 else
10279 {
10280 uDst.au64[0] = 0;
10281 }
10282 *puDst = uDst.u;
10283}
10284
10285
10286IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10287{
10288 RTUINT128U uSrc1 = *puDst;
10289
10290 if (puSrc->au64[0] <= 31)
10291 {
10292 puDst->au32[0] = uSrc1.au32[0] << puSrc->au8[0];
10293 puDst->au32[1] = uSrc1.au32[1] << puSrc->au8[0];
10294 puDst->au32[2] = uSrc1.au32[2] << puSrc->au8[0];
10295 puDst->au32[3] = uSrc1.au32[3] << puSrc->au8[0];
10296 }
10297 else
10298 {
10299 puDst->au64[0] = 0;
10300 puDst->au64[1] = 0;
10301 }
10302}
10303
10304IEM_DECL_IMPL_DEF(void, iemAImpl_pslld_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10305{
10306 RTUINT128U uSrc1 = *puDst;
10307
10308 if (uShift <= 31)
10309 {
10310 puDst->au32[0] = uSrc1.au32[0] << uShift;
10311 puDst->au32[1] = uSrc1.au32[1] << uShift;
10312 puDst->au32[2] = uSrc1.au32[2] << uShift;
10313 puDst->au32[3] = uSrc1.au32[3] << uShift;
10314 }
10315 else
10316 {
10317 puDst->au64[0] = 0;
10318 puDst->au64[1] = 0;
10319 }
10320}
10321
10322#endif
10323
10324
10325/*
10326 * PSRLQ / VPSRLQ
10327 */
10328#ifdef IEM_WITHOUT_ASSEMBLY
10329
10330IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10331{
10332 RTUINT64U uSrc1 = { *puDst };
10333 RTUINT64U uSrc2 = { *puSrc };
10334 RTUINT64U uDst;
10335
10336 if (uSrc2.au64[0] <= 63)
10337 {
10338 uDst.au64[0] = uSrc1.au64[0] >> uSrc2.au8[0];
10339 }
10340 else
10341 {
10342 uDst.au64[0] = 0;
10343 }
10344 *puDst = uDst.u;
10345}
10346
10347
10348IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10349{
10350 RTUINT64U uSrc1 = { *puDst };
10351 RTUINT64U uDst;
10352
10353 if (uShift <= 63)
10354 {
10355 uDst.au64[0] = uSrc1.au64[0] >> uShift;
10356 }
10357 else
10358 {
10359 uDst.au64[0] = 0;
10360 }
10361 *puDst = uDst.u;
10362}
10363
10364
10365IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10366{
10367 RTUINT128U uSrc1 = *puDst;
10368
10369 if (puSrc->au64[0] <= 63)
10370 {
10371 puDst->au64[0] = uSrc1.au64[0] >> puSrc->au8[0];
10372 puDst->au64[1] = uSrc1.au64[1] >> puSrc->au8[0];
10373 }
10374 else
10375 {
10376 puDst->au64[0] = 0;
10377 puDst->au64[1] = 0;
10378 }
10379}
10380
10381IEM_DECL_IMPL_DEF(void, iemAImpl_psrlq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10382{
10383 RTUINT128U uSrc1 = *puDst;
10384
10385 if (uShift <= 63)
10386 {
10387 puDst->au64[0] = uSrc1.au64[0] >> uShift;
10388 puDst->au64[1] = uSrc1.au64[1] >> uShift;
10389 }
10390 else
10391 {
10392 puDst->au64[0] = 0;
10393 puDst->au64[1] = 0;
10394 }
10395}
10396
10397#endif
10398
10399
10400/*
10401 * PSLLQ / VPSLLQ
10402 */
10403#ifdef IEM_WITHOUT_ASSEMBLY
10404
10405IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u64,(uint64_t *puDst, uint64_t const *puSrc))
10406{
10407 RTUINT64U uSrc1 = { *puDst };
10408 RTUINT64U uSrc2 = { *puSrc };
10409 RTUINT64U uDst;
10410
10411 if (uSrc2.au64[0] <= 63)
10412 {
10413 uDst.au64[0] = uSrc1.au64[0] << uSrc2.au8[0];
10414 }
10415 else
10416 {
10417 uDst.au64[0] = 0;
10418 }
10419 *puDst = uDst.u;
10420}
10421
10422
10423IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u64,(uint64_t *puDst, uint8_t uShift))
10424{
10425 RTUINT64U uSrc1 = { *puDst };
10426 RTUINT64U uDst;
10427
10428 if (uShift <= 63)
10429 {
10430 uDst.au64[0] = uSrc1.au64[0] << uShift;
10431 }
10432 else
10433 {
10434 uDst.au64[0] = 0;
10435 }
10436 *puDst = uDst.u;
10437}
10438
10439
10440IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
10441{
10442 RTUINT128U uSrc1 = *puDst;
10443
10444 if (puSrc->au64[0] <= 63)
10445 {
10446 puDst->au64[0] = uSrc1.au64[0] << puSrc->au8[0];
10447 puDst->au64[1] = uSrc1.au64[1] << puSrc->au8[0];
10448 }
10449 else
10450 {
10451 puDst->au64[0] = 0;
10452 puDst->au64[1] = 0;
10453 }
10454}
10455
10456IEM_DECL_IMPL_DEF(void, iemAImpl_psllq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10457{
10458 RTUINT128U uSrc1 = *puDst;
10459
10460 if (uShift <= 63)
10461 {
10462 puDst->au64[0] = uSrc1.au64[0] << uShift;
10463 puDst->au64[1] = uSrc1.au64[1] << uShift;
10464 }
10465 else
10466 {
10467 puDst->au64[0] = 0;
10468 puDst->au64[1] = 0;
10469 }
10470}
10471
10472#endif
10473
10474
10475/*
10476 * PSRLDQ / VPSRLDQ
10477 */
10478#ifdef IEM_WITHOUT_ASSEMBLY
10479
10480IEM_DECL_IMPL_DEF(void, iemAImpl_psrldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10481{
10482 RTUINT128U uSrc1 = *puDst;
10483
10484 if (uShift < 16)
10485 {
10486 int i;
10487
10488 for (i = 0; i < 16 - uShift; ++i)
10489 puDst->au8[i] = uSrc1.au8[i + uShift];
10490 for (i = 16 - uShift; i < 16; ++i)
10491 puDst->au8[i] = 0;
10492 }
10493 else
10494 {
10495 puDst->au64[0] = 0;
10496 puDst->au64[1] = 0;
10497 }
10498}
10499
10500#endif
10501
10502
10503/*
10504 * PSLLDQ / VPSLLDQ
10505 */
10506#ifdef IEM_WITHOUT_ASSEMBLY
10507
10508IEM_DECL_IMPL_DEF(void, iemAImpl_pslldq_imm_u128,(PRTUINT128U puDst, uint8_t uShift))
10509{
10510 RTUINT128U uSrc1 = *puDst;
10511
10512 if (uShift < 16)
10513 {
10514 int i;
10515
10516 for (i = 0; i < uShift; ++i)
10517 puDst->au8[i] = 0;
10518 for (i = uShift; i < 16; ++i)
10519 puDst->au8[i] = uSrc1.au8[i - uShift];
10520 }
10521 else
10522 {
10523 puDst->au64[0] = 0;
10524 puDst->au64[1] = 0;
10525 }
10526}
10527
10528#endif
10529
10530
10531/*
10532 * PMADDWD / VPMADDWD
10533 */
10534#ifdef IEM_WITHOUT_ASSEMBLY
10535
10536IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10537{
10538 RTUINT64U uSrc1 = { *puDst };
10539 RTUINT64U uSrc2 = { *puSrc };
10540 RTUINT64U uDst;
10541
10542 uDst.ai32[0] = (int32_t)uSrc1.ai16[0] * uSrc2.ai16[0] + (int32_t)uSrc1.ai16[1] * uSrc2.ai16[1];
10543 uDst.ai32[1] = (int32_t)uSrc1.ai16[2] * uSrc2.ai16[2] + (int32_t)uSrc1.ai16[3] * uSrc2.ai16[3];
10544 *puDst = uDst.u;
10545 RT_NOREF(pFpuState);
10546}
10547
10548
10549IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddwd_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10550{
10551 RTUINT128U uSrc1 = *puDst;
10552
10553 puDst->ai32[0] = (int32_t)uSrc1.ai16[0] * puSrc->ai16[0] + (int32_t)uSrc1.ai16[1] * puSrc->ai16[1];
10554 puDst->ai32[1] = (int32_t)uSrc1.ai16[2] * puSrc->ai16[2] + (int32_t)uSrc1.ai16[3] * puSrc->ai16[3];
10555 puDst->ai32[2] = (int32_t)uSrc1.ai16[4] * puSrc->ai16[4] + (int32_t)uSrc1.ai16[5] * puSrc->ai16[5];
10556 puDst->ai32[3] = (int32_t)uSrc1.ai16[6] * puSrc->ai16[6] + (int32_t)uSrc1.ai16[7] * puSrc->ai16[7];
10557 RT_NOREF(pFpuState);
10558}
10559
10560#endif
10561
10562
10563/*
10564 * PMAXUB / VPMAXUB / PMAXUW / VPMAXUW / PMAXUD / VPMAXUD
10565 */
10566#ifdef IEM_WITHOUT_ASSEMBLY
10567
10568IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10569{
10570 RTUINT64U uSrc1 = { *puDst };
10571 RTUINT64U uSrc2 = { *puSrc };
10572 RTUINT64U uDst;
10573
10574 uDst.au8[0] = RT_MAX(uSrc1.au8[0], uSrc2.au8[0]);
10575 uDst.au8[1] = RT_MAX(uSrc1.au8[1], uSrc2.au8[1]);
10576 uDst.au8[2] = RT_MAX(uSrc1.au8[2], uSrc2.au8[2]);
10577 uDst.au8[3] = RT_MAX(uSrc1.au8[3], uSrc2.au8[3]);
10578 uDst.au8[4] = RT_MAX(uSrc1.au8[4], uSrc2.au8[4]);
10579 uDst.au8[5] = RT_MAX(uSrc1.au8[5], uSrc2.au8[5]);
10580 uDst.au8[6] = RT_MAX(uSrc1.au8[6], uSrc2.au8[6]);
10581 uDst.au8[7] = RT_MAX(uSrc1.au8[7], uSrc2.au8[7]);
10582 *puDst = uDst.u;
10583 RT_NOREF(pFpuState);
10584}
10585
10586
10587IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10588{
10589 RTUINT128U uSrc1 = *puDst;
10590
10591 puDst->au8[ 0] = RT_MAX(uSrc1.au8[ 0], puSrc->au8[ 0]);
10592 puDst->au8[ 1] = RT_MAX(uSrc1.au8[ 1], puSrc->au8[ 1]);
10593 puDst->au8[ 2] = RT_MAX(uSrc1.au8[ 2], puSrc->au8[ 2]);
10594 puDst->au8[ 3] = RT_MAX(uSrc1.au8[ 3], puSrc->au8[ 3]);
10595 puDst->au8[ 4] = RT_MAX(uSrc1.au8[ 4], puSrc->au8[ 4]);
10596 puDst->au8[ 5] = RT_MAX(uSrc1.au8[ 5], puSrc->au8[ 5]);
10597 puDst->au8[ 6] = RT_MAX(uSrc1.au8[ 6], puSrc->au8[ 6]);
10598 puDst->au8[ 7] = RT_MAX(uSrc1.au8[ 7], puSrc->au8[ 7]);
10599 puDst->au8[ 8] = RT_MAX(uSrc1.au8[ 8], puSrc->au8[ 8]);
10600 puDst->au8[ 9] = RT_MAX(uSrc1.au8[ 9], puSrc->au8[ 9]);
10601 puDst->au8[10] = RT_MAX(uSrc1.au8[10], puSrc->au8[10]);
10602 puDst->au8[11] = RT_MAX(uSrc1.au8[11], puSrc->au8[11]);
10603 puDst->au8[12] = RT_MAX(uSrc1.au8[12], puSrc->au8[12]);
10604 puDst->au8[13] = RT_MAX(uSrc1.au8[13], puSrc->au8[13]);
10605 puDst->au8[14] = RT_MAX(uSrc1.au8[14], puSrc->au8[14]);
10606 puDst->au8[15] = RT_MAX(uSrc1.au8[15], puSrc->au8[15]);
10607 RT_NOREF(pFpuState);
10608}
10609
10610#endif
10611
10612
10613IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10614{
10615 RTUINT128U uSrc1 = *puDst;
10616
10617 puDst->au16[ 0] = RT_MAX(uSrc1.au16[ 0], puSrc->au16[ 0]);
10618 puDst->au16[ 1] = RT_MAX(uSrc1.au16[ 1], puSrc->au16[ 1]);
10619 puDst->au16[ 2] = RT_MAX(uSrc1.au16[ 2], puSrc->au16[ 2]);
10620 puDst->au16[ 3] = RT_MAX(uSrc1.au16[ 3], puSrc->au16[ 3]);
10621 puDst->au16[ 4] = RT_MAX(uSrc1.au16[ 4], puSrc->au16[ 4]);
10622 puDst->au16[ 5] = RT_MAX(uSrc1.au16[ 5], puSrc->au16[ 5]);
10623 puDst->au16[ 6] = RT_MAX(uSrc1.au16[ 6], puSrc->au16[ 6]);
10624 puDst->au16[ 7] = RT_MAX(uSrc1.au16[ 7], puSrc->au16[ 7]);
10625 RT_NOREF(pFpuState);
10626}
10627
10628
10629IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10630{
10631 RTUINT128U uSrc1 = *puDst;
10632
10633 puDst->au32[ 0] = RT_MAX(uSrc1.au32[ 0], puSrc->au32[ 0]);
10634 puDst->au32[ 1] = RT_MAX(uSrc1.au32[ 1], puSrc->au32[ 1]);
10635 puDst->au32[ 2] = RT_MAX(uSrc1.au32[ 2], puSrc->au32[ 2]);
10636 puDst->au32[ 3] = RT_MAX(uSrc1.au32[ 3], puSrc->au32[ 3]);
10637 RT_NOREF(pFpuState);
10638}
10639
10640
10641IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10642 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10643{
10644 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10645 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10646 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10647 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10648 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10649 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10650 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10651 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10652 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10653 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10654 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10655 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10656 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10657 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10658 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10659 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10660 RT_NOREF(pExtState);
10661}
10662
10663
10664IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10665 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10666{
10667 puDst->au8[ 0] = RT_MAX(puSrc1->au8[ 0], puSrc2->au8[ 0]);
10668 puDst->au8[ 1] = RT_MAX(puSrc1->au8[ 1], puSrc2->au8[ 1]);
10669 puDst->au8[ 2] = RT_MAX(puSrc1->au8[ 2], puSrc2->au8[ 2]);
10670 puDst->au8[ 3] = RT_MAX(puSrc1->au8[ 3], puSrc2->au8[ 3]);
10671 puDst->au8[ 4] = RT_MAX(puSrc1->au8[ 4], puSrc2->au8[ 4]);
10672 puDst->au8[ 5] = RT_MAX(puSrc1->au8[ 5], puSrc2->au8[ 5]);
10673 puDst->au8[ 6] = RT_MAX(puSrc1->au8[ 6], puSrc2->au8[ 6]);
10674 puDst->au8[ 7] = RT_MAX(puSrc1->au8[ 7], puSrc2->au8[ 7]);
10675 puDst->au8[ 8] = RT_MAX(puSrc1->au8[ 8], puSrc2->au8[ 8]);
10676 puDst->au8[ 9] = RT_MAX(puSrc1->au8[ 9], puSrc2->au8[ 9]);
10677 puDst->au8[10] = RT_MAX(puSrc1->au8[10], puSrc2->au8[10]);
10678 puDst->au8[11] = RT_MAX(puSrc1->au8[11], puSrc2->au8[11]);
10679 puDst->au8[12] = RT_MAX(puSrc1->au8[12], puSrc2->au8[12]);
10680 puDst->au8[13] = RT_MAX(puSrc1->au8[13], puSrc2->au8[13]);
10681 puDst->au8[14] = RT_MAX(puSrc1->au8[14], puSrc2->au8[14]);
10682 puDst->au8[15] = RT_MAX(puSrc1->au8[15], puSrc2->au8[15]);
10683 puDst->au8[16] = RT_MAX(puSrc1->au8[16], puSrc2->au8[16]);
10684 puDst->au8[17] = RT_MAX(puSrc1->au8[17], puSrc2->au8[17]);
10685 puDst->au8[18] = RT_MAX(puSrc1->au8[18], puSrc2->au8[18]);
10686 puDst->au8[19] = RT_MAX(puSrc1->au8[19], puSrc2->au8[19]);
10687 puDst->au8[20] = RT_MAX(puSrc1->au8[20], puSrc2->au8[20]);
10688 puDst->au8[21] = RT_MAX(puSrc1->au8[21], puSrc2->au8[21]);
10689 puDst->au8[22] = RT_MAX(puSrc1->au8[22], puSrc2->au8[22]);
10690 puDst->au8[23] = RT_MAX(puSrc1->au8[23], puSrc2->au8[23]);
10691 puDst->au8[24] = RT_MAX(puSrc1->au8[24], puSrc2->au8[24]);
10692 puDst->au8[25] = RT_MAX(puSrc1->au8[25], puSrc2->au8[25]);
10693 puDst->au8[26] = RT_MAX(puSrc1->au8[26], puSrc2->au8[26]);
10694 puDst->au8[27] = RT_MAX(puSrc1->au8[27], puSrc2->au8[27]);
10695 puDst->au8[28] = RT_MAX(puSrc1->au8[28], puSrc2->au8[28]);
10696 puDst->au8[29] = RT_MAX(puSrc1->au8[29], puSrc2->au8[29]);
10697 puDst->au8[30] = RT_MAX(puSrc1->au8[30], puSrc2->au8[30]);
10698 puDst->au8[31] = RT_MAX(puSrc1->au8[31], puSrc2->au8[31]);
10699 RT_NOREF(pExtState);
10700}
10701
10702
10703IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10704 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10705{
10706 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10707 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10708 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10709 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10710 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10711 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10712 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10713 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10714 RT_NOREF(pExtState);
10715}
10716
10717
10718IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10719 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10720{
10721 puDst->au16[ 0] = RT_MAX(puSrc1->au16[ 0], puSrc2->au16[ 0]);
10722 puDst->au16[ 1] = RT_MAX(puSrc1->au16[ 1], puSrc2->au16[ 1]);
10723 puDst->au16[ 2] = RT_MAX(puSrc1->au16[ 2], puSrc2->au16[ 2]);
10724 puDst->au16[ 3] = RT_MAX(puSrc1->au16[ 3], puSrc2->au16[ 3]);
10725 puDst->au16[ 4] = RT_MAX(puSrc1->au16[ 4], puSrc2->au16[ 4]);
10726 puDst->au16[ 5] = RT_MAX(puSrc1->au16[ 5], puSrc2->au16[ 5]);
10727 puDst->au16[ 6] = RT_MAX(puSrc1->au16[ 6], puSrc2->au16[ 6]);
10728 puDst->au16[ 7] = RT_MAX(puSrc1->au16[ 7], puSrc2->au16[ 7]);
10729 puDst->au16[ 8] = RT_MAX(puSrc1->au16[ 8], puSrc2->au16[ 8]);
10730 puDst->au16[ 9] = RT_MAX(puSrc1->au16[ 9], puSrc2->au16[ 9]);
10731 puDst->au16[10] = RT_MAX(puSrc1->au16[10], puSrc2->au16[10]);
10732 puDst->au16[11] = RT_MAX(puSrc1->au16[11], puSrc2->au16[11]);
10733 puDst->au16[12] = RT_MAX(puSrc1->au16[12], puSrc2->au16[12]);
10734 puDst->au16[13] = RT_MAX(puSrc1->au16[13], puSrc2->au16[13]);
10735 puDst->au16[14] = RT_MAX(puSrc1->au16[14], puSrc2->au16[14]);
10736 puDst->au16[15] = RT_MAX(puSrc1->au16[15], puSrc2->au16[15]);
10737 RT_NOREF(pExtState);
10738}
10739
10740
10741IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10742 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10743{
10744 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10745 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10746 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10747 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10748 RT_NOREF(pExtState);
10749}
10750
10751
10752IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10753 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10754{
10755 puDst->au32[ 0] = RT_MAX(puSrc1->au32[ 0], puSrc2->au32[ 0]);
10756 puDst->au32[ 1] = RT_MAX(puSrc1->au32[ 1], puSrc2->au32[ 1]);
10757 puDst->au32[ 2] = RT_MAX(puSrc1->au32[ 2], puSrc2->au32[ 2]);
10758 puDst->au32[ 3] = RT_MAX(puSrc1->au32[ 3], puSrc2->au32[ 3]);
10759 puDst->au32[ 4] = RT_MAX(puSrc1->au32[ 4], puSrc2->au32[ 4]);
10760 puDst->au32[ 5] = RT_MAX(puSrc1->au32[ 5], puSrc2->au32[ 5]);
10761 puDst->au32[ 6] = RT_MAX(puSrc1->au32[ 6], puSrc2->au32[ 6]);
10762 puDst->au32[ 7] = RT_MAX(puSrc1->au32[ 7], puSrc2->au32[ 7]);
10763 RT_NOREF(pExtState);
10764}
10765
10766
10767/*
10768 * PMAXSB / VPMAXSB / PMAXSW / VPMAXSW / PMAXSD / VPMAXSD
10769 */
10770#ifdef IEM_WITHOUT_ASSEMBLY
10771
10772IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10773{
10774 RTUINT64U uSrc1 = { *puDst };
10775 RTUINT64U uSrc2 = { *puSrc };
10776 RTUINT64U uDst;
10777
10778 uDst.ai16[0] = RT_MAX(uSrc1.ai16[0], uSrc2.ai16[0]);
10779 uDst.ai16[1] = RT_MAX(uSrc1.ai16[1], uSrc2.ai16[1]);
10780 uDst.ai16[2] = RT_MAX(uSrc1.ai16[2], uSrc2.ai16[2]);
10781 uDst.ai16[3] = RT_MAX(uSrc1.ai16[3], uSrc2.ai16[3]);
10782 *puDst = uDst.u;
10783 RT_NOREF(pFpuState);
10784}
10785
10786
10787IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10788{
10789 RTUINT128U uSrc1 = *puDst;
10790
10791 puDst->ai16[ 0] = RT_MAX(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
10792 puDst->ai16[ 1] = RT_MAX(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
10793 puDst->ai16[ 2] = RT_MAX(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
10794 puDst->ai16[ 3] = RT_MAX(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
10795 puDst->ai16[ 4] = RT_MAX(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
10796 puDst->ai16[ 5] = RT_MAX(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
10797 puDst->ai16[ 6] = RT_MAX(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
10798 puDst->ai16[ 7] = RT_MAX(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
10799 RT_NOREF(pFpuState);
10800}
10801
10802#endif
10803
10804IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10805{
10806 RTUINT128U uSrc1 = *puDst;
10807
10808 puDst->ai8[ 0] = RT_MAX(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
10809 puDst->ai8[ 1] = RT_MAX(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
10810 puDst->ai8[ 2] = RT_MAX(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
10811 puDst->ai8[ 3] = RT_MAX(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
10812 puDst->ai8[ 4] = RT_MAX(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
10813 puDst->ai8[ 5] = RT_MAX(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
10814 puDst->ai8[ 6] = RT_MAX(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
10815 puDst->ai8[ 7] = RT_MAX(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
10816 puDst->ai8[ 8] = RT_MAX(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
10817 puDst->ai8[ 9] = RT_MAX(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
10818 puDst->ai8[10] = RT_MAX(uSrc1.ai8[10], puSrc->ai8[10]);
10819 puDst->ai8[11] = RT_MAX(uSrc1.ai8[11], puSrc->ai8[11]);
10820 puDst->ai8[12] = RT_MAX(uSrc1.ai8[12], puSrc->ai8[12]);
10821 puDst->ai8[13] = RT_MAX(uSrc1.ai8[13], puSrc->ai8[13]);
10822 puDst->ai8[14] = RT_MAX(uSrc1.ai8[14], puSrc->ai8[14]);
10823 puDst->ai8[15] = RT_MAX(uSrc1.ai8[15], puSrc->ai8[15]);
10824 RT_NOREF(pFpuState);
10825}
10826
10827
10828IEM_DECL_IMPL_DEF(void, iemAImpl_pmaxsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10829{
10830 RTUINT128U uSrc1 = *puDst;
10831
10832 puDst->ai32[ 0] = RT_MAX(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
10833 puDst->ai32[ 1] = RT_MAX(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
10834 puDst->ai32[ 2] = RT_MAX(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
10835 puDst->ai32[ 3] = RT_MAX(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
10836 RT_NOREF(pFpuState);
10837}
10838
10839
10840IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10841 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10842{
10843 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10844 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10845 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10846 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10847 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10848 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10849 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10850 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10851 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10852 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10853 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10854 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10855 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10856 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10857 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10858 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10859 RT_NOREF(pExtState);
10860}
10861
10862
10863IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10864 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10865{
10866 puDst->ai8[ 0] = RT_MAX(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
10867 puDst->ai8[ 1] = RT_MAX(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
10868 puDst->ai8[ 2] = RT_MAX(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
10869 puDst->ai8[ 3] = RT_MAX(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
10870 puDst->ai8[ 4] = RT_MAX(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
10871 puDst->ai8[ 5] = RT_MAX(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
10872 puDst->ai8[ 6] = RT_MAX(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
10873 puDst->ai8[ 7] = RT_MAX(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
10874 puDst->ai8[ 8] = RT_MAX(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
10875 puDst->ai8[ 9] = RT_MAX(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
10876 puDst->ai8[10] = RT_MAX(puSrc1->ai8[10], puSrc2->ai8[10]);
10877 puDst->ai8[11] = RT_MAX(puSrc1->ai8[11], puSrc2->ai8[11]);
10878 puDst->ai8[12] = RT_MAX(puSrc1->ai8[12], puSrc2->ai8[12]);
10879 puDst->ai8[13] = RT_MAX(puSrc1->ai8[13], puSrc2->ai8[13]);
10880 puDst->ai8[14] = RT_MAX(puSrc1->ai8[14], puSrc2->ai8[14]);
10881 puDst->ai8[15] = RT_MAX(puSrc1->ai8[15], puSrc2->ai8[15]);
10882 puDst->ai8[16] = RT_MAX(puSrc1->ai8[16], puSrc2->ai8[16]);
10883 puDst->ai8[17] = RT_MAX(puSrc1->ai8[17], puSrc2->ai8[17]);
10884 puDst->ai8[18] = RT_MAX(puSrc1->ai8[18], puSrc2->ai8[18]);
10885 puDst->ai8[19] = RT_MAX(puSrc1->ai8[19], puSrc2->ai8[19]);
10886 puDst->ai8[20] = RT_MAX(puSrc1->ai8[20], puSrc2->ai8[20]);
10887 puDst->ai8[21] = RT_MAX(puSrc1->ai8[21], puSrc2->ai8[21]);
10888 puDst->ai8[22] = RT_MAX(puSrc1->ai8[22], puSrc2->ai8[22]);
10889 puDst->ai8[23] = RT_MAX(puSrc1->ai8[23], puSrc2->ai8[23]);
10890 puDst->ai8[24] = RT_MAX(puSrc1->ai8[24], puSrc2->ai8[24]);
10891 puDst->ai8[25] = RT_MAX(puSrc1->ai8[25], puSrc2->ai8[25]);
10892 puDst->ai8[26] = RT_MAX(puSrc1->ai8[26], puSrc2->ai8[26]);
10893 puDst->ai8[27] = RT_MAX(puSrc1->ai8[27], puSrc2->ai8[27]);
10894 puDst->ai8[28] = RT_MAX(puSrc1->ai8[28], puSrc2->ai8[28]);
10895 puDst->ai8[29] = RT_MAX(puSrc1->ai8[29], puSrc2->ai8[29]);
10896 puDst->ai8[30] = RT_MAX(puSrc1->ai8[30], puSrc2->ai8[30]);
10897 puDst->ai8[31] = RT_MAX(puSrc1->ai8[31], puSrc2->ai8[31]);
10898 RT_NOREF(pExtState);
10899}
10900
10901
10902IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10903 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10904{
10905 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10906 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10907 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10908 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10909 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10910 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10911 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10912 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10913 RT_NOREF(pExtState);
10914}
10915
10916
10917IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10918 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10919{
10920 puDst->ai16[ 0] = RT_MAX(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
10921 puDst->ai16[ 1] = RT_MAX(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
10922 puDst->ai16[ 2] = RT_MAX(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
10923 puDst->ai16[ 3] = RT_MAX(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
10924 puDst->ai16[ 4] = RT_MAX(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
10925 puDst->ai16[ 5] = RT_MAX(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
10926 puDst->ai16[ 6] = RT_MAX(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
10927 puDst->ai16[ 7] = RT_MAX(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
10928 puDst->ai16[ 8] = RT_MAX(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
10929 puDst->ai16[ 9] = RT_MAX(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
10930 puDst->ai16[10] = RT_MAX(puSrc1->ai16[10], puSrc2->ai16[10]);
10931 puDst->ai16[11] = RT_MAX(puSrc1->ai16[11], puSrc2->ai16[11]);
10932 puDst->ai16[12] = RT_MAX(puSrc1->ai16[12], puSrc2->ai16[12]);
10933 puDst->ai16[13] = RT_MAX(puSrc1->ai16[13], puSrc2->ai16[13]);
10934 puDst->ai16[14] = RT_MAX(puSrc1->ai16[14], puSrc2->ai16[14]);
10935 puDst->ai16[15] = RT_MAX(puSrc1->ai16[15], puSrc2->ai16[15]);
10936 RT_NOREF(pExtState);
10937}
10938
10939
10940IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
10941 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
10942{
10943 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10944 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10945 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10946 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10947 RT_NOREF(pExtState);
10948}
10949
10950
10951IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaxsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
10952 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
10953{
10954 puDst->ai32[ 0] = RT_MAX(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
10955 puDst->ai32[ 1] = RT_MAX(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
10956 puDst->ai32[ 2] = RT_MAX(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
10957 puDst->ai32[ 3] = RT_MAX(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
10958 puDst->ai32[ 4] = RT_MAX(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
10959 puDst->ai32[ 5] = RT_MAX(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
10960 puDst->ai32[ 6] = RT_MAX(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
10961 puDst->ai32[ 7] = RT_MAX(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
10962 RT_NOREF(pExtState);
10963}
10964
10965
10966/*
10967 * PMINUB / VPMINUB / PMINUW / VPMINUW / PMINUD / VPMINUD
10968 */
10969#ifdef IEM_WITHOUT_ASSEMBLY
10970
10971IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
10972{
10973 RTUINT64U uSrc1 = { *puDst };
10974 RTUINT64U uSrc2 = { *puSrc };
10975 RTUINT64U uDst;
10976
10977 uDst.au8[0] = RT_MIN(uSrc1.au8[0], uSrc2.au8[0]);
10978 uDst.au8[1] = RT_MIN(uSrc1.au8[1], uSrc2.au8[1]);
10979 uDst.au8[2] = RT_MIN(uSrc1.au8[2], uSrc2.au8[2]);
10980 uDst.au8[3] = RT_MIN(uSrc1.au8[3], uSrc2.au8[3]);
10981 uDst.au8[4] = RT_MIN(uSrc1.au8[4], uSrc2.au8[4]);
10982 uDst.au8[5] = RT_MIN(uSrc1.au8[5], uSrc2.au8[5]);
10983 uDst.au8[6] = RT_MIN(uSrc1.au8[6], uSrc2.au8[6]);
10984 uDst.au8[7] = RT_MIN(uSrc1.au8[7], uSrc2.au8[7]);
10985 *puDst = uDst.u;
10986 RT_NOREF(pFpuState);
10987}
10988
10989
10990IEM_DECL_IMPL_DEF(void, iemAImpl_pminub_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
10991{
10992 RTUINT128U uSrc1 = *puDst;
10993
10994 puDst->au8[ 0] = RT_MIN(uSrc1.au8[ 0], puSrc->au8[ 0]);
10995 puDst->au8[ 1] = RT_MIN(uSrc1.au8[ 1], puSrc->au8[ 1]);
10996 puDst->au8[ 2] = RT_MIN(uSrc1.au8[ 2], puSrc->au8[ 2]);
10997 puDst->au8[ 3] = RT_MIN(uSrc1.au8[ 3], puSrc->au8[ 3]);
10998 puDst->au8[ 4] = RT_MIN(uSrc1.au8[ 4], puSrc->au8[ 4]);
10999 puDst->au8[ 5] = RT_MIN(uSrc1.au8[ 5], puSrc->au8[ 5]);
11000 puDst->au8[ 6] = RT_MIN(uSrc1.au8[ 6], puSrc->au8[ 6]);
11001 puDst->au8[ 7] = RT_MIN(uSrc1.au8[ 7], puSrc->au8[ 7]);
11002 puDst->au8[ 8] = RT_MIN(uSrc1.au8[ 8], puSrc->au8[ 8]);
11003 puDst->au8[ 9] = RT_MIN(uSrc1.au8[ 9], puSrc->au8[ 9]);
11004 puDst->au8[10] = RT_MIN(uSrc1.au8[10], puSrc->au8[10]);
11005 puDst->au8[11] = RT_MIN(uSrc1.au8[11], puSrc->au8[11]);
11006 puDst->au8[12] = RT_MIN(uSrc1.au8[12], puSrc->au8[12]);
11007 puDst->au8[13] = RT_MIN(uSrc1.au8[13], puSrc->au8[13]);
11008 puDst->au8[14] = RT_MIN(uSrc1.au8[14], puSrc->au8[14]);
11009 puDst->au8[15] = RT_MIN(uSrc1.au8[15], puSrc->au8[15]);
11010 RT_NOREF(pFpuState);
11011}
11012
11013#endif
11014
11015IEM_DECL_IMPL_DEF(void, iemAImpl_pminuw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11016{
11017 RTUINT128U uSrc1 = *puDst;
11018
11019 puDst->au16[ 0] = RT_MIN(uSrc1.au16[ 0], puSrc->au16[ 0]);
11020 puDst->au16[ 1] = RT_MIN(uSrc1.au16[ 1], puSrc->au16[ 1]);
11021 puDst->au16[ 2] = RT_MIN(uSrc1.au16[ 2], puSrc->au16[ 2]);
11022 puDst->au16[ 3] = RT_MIN(uSrc1.au16[ 3], puSrc->au16[ 3]);
11023 puDst->au16[ 4] = RT_MIN(uSrc1.au16[ 4], puSrc->au16[ 4]);
11024 puDst->au16[ 5] = RT_MIN(uSrc1.au16[ 5], puSrc->au16[ 5]);
11025 puDst->au16[ 6] = RT_MIN(uSrc1.au16[ 6], puSrc->au16[ 6]);
11026 puDst->au16[ 7] = RT_MIN(uSrc1.au16[ 7], puSrc->au16[ 7]);
11027 RT_NOREF(pFpuState);
11028}
11029
11030
11031IEM_DECL_IMPL_DEF(void, iemAImpl_pminud_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11032{
11033 RTUINT128U uSrc1 = *puDst;
11034
11035 puDst->au32[ 0] = RT_MIN(uSrc1.au32[ 0], puSrc->au32[ 0]);
11036 puDst->au32[ 1] = RT_MIN(uSrc1.au32[ 1], puSrc->au32[ 1]);
11037 puDst->au32[ 2] = RT_MIN(uSrc1.au32[ 2], puSrc->au32[ 2]);
11038 puDst->au32[ 3] = RT_MIN(uSrc1.au32[ 3], puSrc->au32[ 3]);
11039 RT_NOREF(pFpuState);
11040}
11041
11042
11043IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11044 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11045{
11046 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11047 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11048 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11049 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11050 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11051 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11052 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11053 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11054 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11055 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11056 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11057 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11058 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11059 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11060 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11061 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11062 RT_NOREF(pExtState);
11063}
11064
11065
11066IEM_DECL_IMPL_DEF(void, iemAImpl_vpminub_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11067 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11068{
11069 puDst->au8[ 0] = RT_MIN(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11070 puDst->au8[ 1] = RT_MIN(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11071 puDst->au8[ 2] = RT_MIN(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11072 puDst->au8[ 3] = RT_MIN(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11073 puDst->au8[ 4] = RT_MIN(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11074 puDst->au8[ 5] = RT_MIN(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11075 puDst->au8[ 6] = RT_MIN(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11076 puDst->au8[ 7] = RT_MIN(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11077 puDst->au8[ 8] = RT_MIN(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11078 puDst->au8[ 9] = RT_MIN(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11079 puDst->au8[10] = RT_MIN(puSrc1->au8[10], puSrc2->au8[10]);
11080 puDst->au8[11] = RT_MIN(puSrc1->au8[11], puSrc2->au8[11]);
11081 puDst->au8[12] = RT_MIN(puSrc1->au8[12], puSrc2->au8[12]);
11082 puDst->au8[13] = RT_MIN(puSrc1->au8[13], puSrc2->au8[13]);
11083 puDst->au8[14] = RT_MIN(puSrc1->au8[14], puSrc2->au8[14]);
11084 puDst->au8[15] = RT_MIN(puSrc1->au8[15], puSrc2->au8[15]);
11085 puDst->au8[16] = RT_MIN(puSrc1->au8[16], puSrc2->au8[16]);
11086 puDst->au8[17] = RT_MIN(puSrc1->au8[17], puSrc2->au8[17]);
11087 puDst->au8[18] = RT_MIN(puSrc1->au8[18], puSrc2->au8[18]);
11088 puDst->au8[19] = RT_MIN(puSrc1->au8[19], puSrc2->au8[19]);
11089 puDst->au8[20] = RT_MIN(puSrc1->au8[20], puSrc2->au8[20]);
11090 puDst->au8[21] = RT_MIN(puSrc1->au8[21], puSrc2->au8[21]);
11091 puDst->au8[22] = RT_MIN(puSrc1->au8[22], puSrc2->au8[22]);
11092 puDst->au8[23] = RT_MIN(puSrc1->au8[23], puSrc2->au8[23]);
11093 puDst->au8[24] = RT_MIN(puSrc1->au8[24], puSrc2->au8[24]);
11094 puDst->au8[25] = RT_MIN(puSrc1->au8[25], puSrc2->au8[25]);
11095 puDst->au8[26] = RT_MIN(puSrc1->au8[26], puSrc2->au8[26]);
11096 puDst->au8[27] = RT_MIN(puSrc1->au8[27], puSrc2->au8[27]);
11097 puDst->au8[28] = RT_MIN(puSrc1->au8[28], puSrc2->au8[28]);
11098 puDst->au8[29] = RT_MIN(puSrc1->au8[29], puSrc2->au8[29]);
11099 puDst->au8[30] = RT_MIN(puSrc1->au8[30], puSrc2->au8[30]);
11100 puDst->au8[31] = RT_MIN(puSrc1->au8[31], puSrc2->au8[31]);
11101 RT_NOREF(pExtState);
11102}
11103
11104
11105IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11106 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11107{
11108 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11109 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11110 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11111 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11112 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11113 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11114 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11115 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11116 RT_NOREF(pExtState);
11117}
11118
11119
11120IEM_DECL_IMPL_DEF(void, iemAImpl_vpminuw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11121 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11122{
11123 puDst->au16[ 0] = RT_MIN(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11124 puDst->au16[ 1] = RT_MIN(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11125 puDst->au16[ 2] = RT_MIN(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11126 puDst->au16[ 3] = RT_MIN(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11127 puDst->au16[ 4] = RT_MIN(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11128 puDst->au16[ 5] = RT_MIN(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11129 puDst->au16[ 6] = RT_MIN(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11130 puDst->au16[ 7] = RT_MIN(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11131 puDst->au16[ 8] = RT_MIN(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11132 puDst->au16[ 9] = RT_MIN(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11133 puDst->au16[10] = RT_MIN(puSrc1->au16[10], puSrc2->au16[10]);
11134 puDst->au16[11] = RT_MIN(puSrc1->au16[11], puSrc2->au16[11]);
11135 puDst->au16[12] = RT_MIN(puSrc1->au16[12], puSrc2->au16[12]);
11136 puDst->au16[13] = RT_MIN(puSrc1->au16[13], puSrc2->au16[13]);
11137 puDst->au16[14] = RT_MIN(puSrc1->au16[14], puSrc2->au16[14]);
11138 puDst->au16[15] = RT_MIN(puSrc1->au16[15], puSrc2->au16[15]);
11139 RT_NOREF(pExtState);
11140}
11141
11142
11143IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11144 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11145{
11146 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11147 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11148 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11149 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11150 RT_NOREF(pExtState);
11151}
11152
11153
11154IEM_DECL_IMPL_DEF(void, iemAImpl_vpminud_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11155 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11156{
11157 puDst->au32[ 0] = RT_MIN(puSrc1->au32[ 0], puSrc2->au32[ 0]);
11158 puDst->au32[ 1] = RT_MIN(puSrc1->au32[ 1], puSrc2->au32[ 1]);
11159 puDst->au32[ 2] = RT_MIN(puSrc1->au32[ 2], puSrc2->au32[ 2]);
11160 puDst->au32[ 3] = RT_MIN(puSrc1->au32[ 3], puSrc2->au32[ 3]);
11161 puDst->au32[ 4] = RT_MIN(puSrc1->au32[ 4], puSrc2->au32[ 4]);
11162 puDst->au32[ 5] = RT_MIN(puSrc1->au32[ 5], puSrc2->au32[ 5]);
11163 puDst->au32[ 6] = RT_MIN(puSrc1->au32[ 6], puSrc2->au32[ 6]);
11164 puDst->au32[ 7] = RT_MIN(puSrc1->au32[ 7], puSrc2->au32[ 7]);
11165 RT_NOREF(pExtState);
11166}
11167
11168
11169/*
11170 * PMINSB / VPMINSB / PMINSW / VPMINSW / PMINSD / VPMINSD
11171 */
11172#ifdef IEM_WITHOUT_ASSEMBLY
11173
11174IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11175{
11176 RTUINT64U uSrc1 = { *puDst };
11177 RTUINT64U uSrc2 = { *puSrc };
11178 RTUINT64U uDst;
11179
11180 uDst.ai16[0] = RT_MIN(uSrc1.ai16[0], uSrc2.ai16[0]);
11181 uDst.ai16[1] = RT_MIN(uSrc1.ai16[1], uSrc2.ai16[1]);
11182 uDst.ai16[2] = RT_MIN(uSrc1.ai16[2], uSrc2.ai16[2]);
11183 uDst.ai16[3] = RT_MIN(uSrc1.ai16[3], uSrc2.ai16[3]);
11184 *puDst = uDst.u;
11185 RT_NOREF(pFpuState);
11186}
11187
11188
11189IEM_DECL_IMPL_DEF(void, iemAImpl_pminsw_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11190{
11191 RTUINT128U uSrc1 = *puDst;
11192
11193 puDst->ai16[ 0] = RT_MIN(uSrc1.ai16[ 0], puSrc->ai16[ 0]);
11194 puDst->ai16[ 1] = RT_MIN(uSrc1.ai16[ 1], puSrc->ai16[ 1]);
11195 puDst->ai16[ 2] = RT_MIN(uSrc1.ai16[ 2], puSrc->ai16[ 2]);
11196 puDst->ai16[ 3] = RT_MIN(uSrc1.ai16[ 3], puSrc->ai16[ 3]);
11197 puDst->ai16[ 4] = RT_MIN(uSrc1.ai16[ 4], puSrc->ai16[ 4]);
11198 puDst->ai16[ 5] = RT_MIN(uSrc1.ai16[ 5], puSrc->ai16[ 5]);
11199 puDst->ai16[ 6] = RT_MIN(uSrc1.ai16[ 6], puSrc->ai16[ 6]);
11200 puDst->ai16[ 7] = RT_MIN(uSrc1.ai16[ 7], puSrc->ai16[ 7]);
11201 RT_NOREF(pFpuState);
11202}
11203
11204#endif
11205
11206IEM_DECL_IMPL_DEF(void, iemAImpl_pminsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11207{
11208 RTUINT128U uSrc1 = *puDst;
11209
11210 puDst->ai8[ 0] = RT_MIN(uSrc1.ai8[ 0], puSrc->ai8[ 0]);
11211 puDst->ai8[ 1] = RT_MIN(uSrc1.ai8[ 1], puSrc->ai8[ 1]);
11212 puDst->ai8[ 2] = RT_MIN(uSrc1.ai8[ 2], puSrc->ai8[ 2]);
11213 puDst->ai8[ 3] = RT_MIN(uSrc1.ai8[ 3], puSrc->ai8[ 3]);
11214 puDst->ai8[ 4] = RT_MIN(uSrc1.ai8[ 4], puSrc->ai8[ 4]);
11215 puDst->ai8[ 5] = RT_MIN(uSrc1.ai8[ 5], puSrc->ai8[ 5]);
11216 puDst->ai8[ 6] = RT_MIN(uSrc1.ai8[ 6], puSrc->ai8[ 6]);
11217 puDst->ai8[ 7] = RT_MIN(uSrc1.ai8[ 7], puSrc->ai8[ 7]);
11218 puDst->ai8[ 8] = RT_MIN(uSrc1.ai8[ 8], puSrc->ai8[ 8]);
11219 puDst->ai8[ 9] = RT_MIN(uSrc1.ai8[ 9], puSrc->ai8[ 9]);
11220 puDst->ai8[10] = RT_MIN(uSrc1.ai8[10], puSrc->ai8[10]);
11221 puDst->ai8[11] = RT_MIN(uSrc1.ai8[11], puSrc->ai8[11]);
11222 puDst->ai8[12] = RT_MIN(uSrc1.ai8[12], puSrc->ai8[12]);
11223 puDst->ai8[13] = RT_MIN(uSrc1.ai8[13], puSrc->ai8[13]);
11224 puDst->ai8[14] = RT_MIN(uSrc1.ai8[14], puSrc->ai8[14]);
11225 puDst->ai8[15] = RT_MIN(uSrc1.ai8[15], puSrc->ai8[15]);
11226 RT_NOREF(pFpuState);
11227}
11228
11229
11230IEM_DECL_IMPL_DEF(void, iemAImpl_pminsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11231{
11232 RTUINT128U uSrc1 = *puDst;
11233
11234 puDst->ai32[ 0] = RT_MIN(uSrc1.ai32[ 0], puSrc->ai32[ 0]);
11235 puDst->ai32[ 1] = RT_MIN(uSrc1.ai32[ 1], puSrc->ai32[ 1]);
11236 puDst->ai32[ 2] = RT_MIN(uSrc1.ai32[ 2], puSrc->ai32[ 2]);
11237 puDst->ai32[ 3] = RT_MIN(uSrc1.ai32[ 3], puSrc->ai32[ 3]);
11238 RT_NOREF(pFpuState);
11239}
11240
11241
11242IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11243 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11244{
11245 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11246 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11247 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11248 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11249 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11250 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11251 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11252 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11253 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11254 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11255 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11256 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11257 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11258 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11259 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11260 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11261 RT_NOREF(pExtState);
11262}
11263
11264
11265IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11266 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11267{
11268 puDst->ai8[ 0] = RT_MIN(puSrc1->ai8[ 0], puSrc2->ai8[ 0]);
11269 puDst->ai8[ 1] = RT_MIN(puSrc1->ai8[ 1], puSrc2->ai8[ 1]);
11270 puDst->ai8[ 2] = RT_MIN(puSrc1->ai8[ 2], puSrc2->ai8[ 2]);
11271 puDst->ai8[ 3] = RT_MIN(puSrc1->ai8[ 3], puSrc2->ai8[ 3]);
11272 puDst->ai8[ 4] = RT_MIN(puSrc1->ai8[ 4], puSrc2->ai8[ 4]);
11273 puDst->ai8[ 5] = RT_MIN(puSrc1->ai8[ 5], puSrc2->ai8[ 5]);
11274 puDst->ai8[ 6] = RT_MIN(puSrc1->ai8[ 6], puSrc2->ai8[ 6]);
11275 puDst->ai8[ 7] = RT_MIN(puSrc1->ai8[ 7], puSrc2->ai8[ 7]);
11276 puDst->ai8[ 8] = RT_MIN(puSrc1->ai8[ 8], puSrc2->ai8[ 8]);
11277 puDst->ai8[ 9] = RT_MIN(puSrc1->ai8[ 9], puSrc2->ai8[ 9]);
11278 puDst->ai8[10] = RT_MIN(puSrc1->ai8[10], puSrc2->ai8[10]);
11279 puDst->ai8[11] = RT_MIN(puSrc1->ai8[11], puSrc2->ai8[11]);
11280 puDst->ai8[12] = RT_MIN(puSrc1->ai8[12], puSrc2->ai8[12]);
11281 puDst->ai8[13] = RT_MIN(puSrc1->ai8[13], puSrc2->ai8[13]);
11282 puDst->ai8[14] = RT_MIN(puSrc1->ai8[14], puSrc2->ai8[14]);
11283 puDst->ai8[15] = RT_MIN(puSrc1->ai8[15], puSrc2->ai8[15]);
11284 puDst->ai8[16] = RT_MIN(puSrc1->ai8[16], puSrc2->ai8[16]);
11285 puDst->ai8[17] = RT_MIN(puSrc1->ai8[17], puSrc2->ai8[17]);
11286 puDst->ai8[18] = RT_MIN(puSrc1->ai8[18], puSrc2->ai8[18]);
11287 puDst->ai8[19] = RT_MIN(puSrc1->ai8[19], puSrc2->ai8[19]);
11288 puDst->ai8[20] = RT_MIN(puSrc1->ai8[20], puSrc2->ai8[20]);
11289 puDst->ai8[21] = RT_MIN(puSrc1->ai8[21], puSrc2->ai8[21]);
11290 puDst->ai8[22] = RT_MIN(puSrc1->ai8[22], puSrc2->ai8[22]);
11291 puDst->ai8[23] = RT_MIN(puSrc1->ai8[23], puSrc2->ai8[23]);
11292 puDst->ai8[24] = RT_MIN(puSrc1->ai8[24], puSrc2->ai8[24]);
11293 puDst->ai8[25] = RT_MIN(puSrc1->ai8[25], puSrc2->ai8[25]);
11294 puDst->ai8[26] = RT_MIN(puSrc1->ai8[26], puSrc2->ai8[26]);
11295 puDst->ai8[27] = RT_MIN(puSrc1->ai8[27], puSrc2->ai8[27]);
11296 puDst->ai8[28] = RT_MIN(puSrc1->ai8[28], puSrc2->ai8[28]);
11297 puDst->ai8[29] = RT_MIN(puSrc1->ai8[29], puSrc2->ai8[29]);
11298 puDst->ai8[30] = RT_MIN(puSrc1->ai8[30], puSrc2->ai8[30]);
11299 puDst->ai8[31] = RT_MIN(puSrc1->ai8[31], puSrc2->ai8[31]);
11300 RT_NOREF(pExtState);
11301}
11302
11303
11304IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11305 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11306{
11307 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11308 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11309 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11310 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11311 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11312 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11313 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11314 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11315 RT_NOREF(pExtState);
11316}
11317
11318
11319IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsw_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11320 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11321{
11322 puDst->ai16[ 0] = RT_MIN(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
11323 puDst->ai16[ 1] = RT_MIN(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
11324 puDst->ai16[ 2] = RT_MIN(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
11325 puDst->ai16[ 3] = RT_MIN(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
11326 puDst->ai16[ 4] = RT_MIN(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
11327 puDst->ai16[ 5] = RT_MIN(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
11328 puDst->ai16[ 6] = RT_MIN(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
11329 puDst->ai16[ 7] = RT_MIN(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
11330 puDst->ai16[ 8] = RT_MIN(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
11331 puDst->ai16[ 9] = RT_MIN(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
11332 puDst->ai16[10] = RT_MIN(puSrc1->ai16[10], puSrc2->ai16[10]);
11333 puDst->ai16[11] = RT_MIN(puSrc1->ai16[11], puSrc2->ai16[11]);
11334 puDst->ai16[12] = RT_MIN(puSrc1->ai16[12], puSrc2->ai16[12]);
11335 puDst->ai16[13] = RT_MIN(puSrc1->ai16[13], puSrc2->ai16[13]);
11336 puDst->ai16[14] = RT_MIN(puSrc1->ai16[14], puSrc2->ai16[14]);
11337 puDst->ai16[15] = RT_MIN(puSrc1->ai16[15], puSrc2->ai16[15]);
11338 RT_NOREF(pExtState);
11339}
11340
11341
11342IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11343 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11344{
11345 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11346 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11347 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11348 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11349 RT_NOREF(pExtState);
11350}
11351
11352
11353IEM_DECL_IMPL_DEF(void, iemAImpl_vpminsd_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11354 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11355{
11356 puDst->ai32[ 0] = RT_MIN(puSrc1->ai32[ 0], puSrc2->ai32[ 0]);
11357 puDst->ai32[ 1] = RT_MIN(puSrc1->ai32[ 1], puSrc2->ai32[ 1]);
11358 puDst->ai32[ 2] = RT_MIN(puSrc1->ai32[ 2], puSrc2->ai32[ 2]);
11359 puDst->ai32[ 3] = RT_MIN(puSrc1->ai32[ 3], puSrc2->ai32[ 3]);
11360 puDst->ai32[ 4] = RT_MIN(puSrc1->ai32[ 4], puSrc2->ai32[ 4]);
11361 puDst->ai32[ 5] = RT_MIN(puSrc1->ai32[ 5], puSrc2->ai32[ 5]);
11362 puDst->ai32[ 6] = RT_MIN(puSrc1->ai32[ 6], puSrc2->ai32[ 6]);
11363 puDst->ai32[ 7] = RT_MIN(puSrc1->ai32[ 7], puSrc2->ai32[ 7]);
11364 RT_NOREF(pExtState);
11365}
11366
11367
11368/*
11369 * PAVGB / VPAVGB / PAVGW / VPAVGW
11370 */
/** Rounded-up average of two byte values: (a + b + 1) >> 1, with the first
 *  operand widened to 16 bits before the addition; the result is cast back
 *  to uint8_t. */
#define PAVGB_EXEC(a_uLeft, a_uRight) \
    ( (uint8_t)( ((uint16_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
/** Rounded-up average of two word values: (a + b + 1) >> 1, with the first
 *  operand widened to 32 bits before the addition; the result is cast back
 *  to uint16_t. */
#define PAVGW_EXEC(a_uLeft, a_uRight) \
    ( (uint16_t)( ((uint32_t)(a_uLeft) + (a_uRight) + 1) >> 1 ) )
11373
11374#ifdef IEM_WITHOUT_ASSEMBLY
11375
11376IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u64,(uint64_t *puDst, uint64_t const *puSrc))
11377{
11378 RTUINT64U uSrc1 = { *puDst };
11379 RTUINT64U uSrc2 = { *puSrc };
11380 RTUINT64U uDst;
11381
11382 uDst.au8[0] = PAVGB_EXEC(uSrc1.au8[0], uSrc2.au8[0]);
11383 uDst.au8[1] = PAVGB_EXEC(uSrc1.au8[1], uSrc2.au8[1]);
11384 uDst.au8[2] = PAVGB_EXEC(uSrc1.au8[2], uSrc2.au8[2]);
11385 uDst.au8[3] = PAVGB_EXEC(uSrc1.au8[3], uSrc2.au8[3]);
11386 uDst.au8[4] = PAVGB_EXEC(uSrc1.au8[4], uSrc2.au8[4]);
11387 uDst.au8[5] = PAVGB_EXEC(uSrc1.au8[5], uSrc2.au8[5]);
11388 uDst.au8[6] = PAVGB_EXEC(uSrc1.au8[6], uSrc2.au8[6]);
11389 uDst.au8[7] = PAVGB_EXEC(uSrc1.au8[7], uSrc2.au8[7]);
11390 *puDst = uDst.u;
11391}
11392
11393
11394IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11395{
11396 RTUINT128U uSrc1 = *puDst;
11397
11398 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11399 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11400 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11401 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11402 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11403 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11404 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11405 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11406 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11407 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11408 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11409 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11410 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11411 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11412 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11413 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11414}
11415
11416
11417IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11418{
11419 RTUINT64U uSrc1 = { *puDst };
11420 RTUINT64U uSrc2 = { *puSrc };
11421 RTUINT64U uDst;
11422
11423 uDst.au16[0] = PAVGW_EXEC(uSrc1.au16[0], uSrc2.au16[0]);
11424 uDst.au16[1] = PAVGW_EXEC(uSrc1.au16[1], uSrc2.au16[1]);
11425 uDst.au16[2] = PAVGW_EXEC(uSrc1.au16[2], uSrc2.au16[2]);
11426 uDst.au16[3] = PAVGW_EXEC(uSrc1.au16[3], uSrc2.au16[3]);
11427 *puDst = uDst.u;
11428}
11429
11430
11431IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11432{
11433 RTUINT128U uSrc1 = *puDst;
11434
11435 puDst->au16[0] = PAVGW_EXEC(uSrc1.au16[0], puSrc->au16[0]);
11436 puDst->au16[1] = PAVGW_EXEC(uSrc1.au16[1], puSrc->au16[1]);
11437 puDst->au16[2] = PAVGW_EXEC(uSrc1.au16[2], puSrc->au16[2]);
11438 puDst->au16[3] = PAVGW_EXEC(uSrc1.au16[3], puSrc->au16[3]);
11439 puDst->au16[4] = PAVGW_EXEC(uSrc1.au16[4], puSrc->au16[4]);
11440 puDst->au16[5] = PAVGW_EXEC(uSrc1.au16[5], puSrc->au16[5]);
11441 puDst->au16[6] = PAVGW_EXEC(uSrc1.au16[6], puSrc->au16[6]);
11442 puDst->au16[7] = PAVGW_EXEC(uSrc1.au16[7], puSrc->au16[7]);
11443}
11444
11445#endif
11446
11447IEM_DECL_IMPL_DEF(void, iemAImpl_pavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11448{
11449 RTUINT128U uSrc1 = *puDst;
11450
11451 puDst->au8[ 0] = PAVGB_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11452 puDst->au8[ 1] = PAVGB_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11453 puDst->au8[ 2] = PAVGB_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11454 puDst->au8[ 3] = PAVGB_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11455 puDst->au8[ 4] = PAVGB_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11456 puDst->au8[ 5] = PAVGB_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11457 puDst->au8[ 6] = PAVGB_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11458 puDst->au8[ 7] = PAVGB_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11459 puDst->au8[ 8] = PAVGB_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11460 puDst->au8[ 9] = PAVGB_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11461 puDst->au8[10] = PAVGB_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11462 puDst->au8[11] = PAVGB_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11463 puDst->au8[12] = PAVGB_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11464 puDst->au8[13] = PAVGB_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11465 puDst->au8[14] = PAVGB_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11466 puDst->au8[15] = PAVGB_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11467}
11468
11469
11470IEM_DECL_IMPL_DEF(void, iemAImpl_pavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11471{
11472 RTUINT128U uSrc1 = *puDst;
11473
11474 puDst->au8[ 0] = PAVGW_EXEC(uSrc1.au8[ 0], puSrc->au8[ 0]);
11475 puDst->au8[ 1] = PAVGW_EXEC(uSrc1.au8[ 1], puSrc->au8[ 1]);
11476 puDst->au8[ 2] = PAVGW_EXEC(uSrc1.au8[ 2], puSrc->au8[ 2]);
11477 puDst->au8[ 3] = PAVGW_EXEC(uSrc1.au8[ 3], puSrc->au8[ 3]);
11478 puDst->au8[ 4] = PAVGW_EXEC(uSrc1.au8[ 4], puSrc->au8[ 4]);
11479 puDst->au8[ 5] = PAVGW_EXEC(uSrc1.au8[ 5], puSrc->au8[ 5]);
11480 puDst->au8[ 6] = PAVGW_EXEC(uSrc1.au8[ 6], puSrc->au8[ 6]);
11481 puDst->au8[ 7] = PAVGW_EXEC(uSrc1.au8[ 7], puSrc->au8[ 7]);
11482 puDst->au8[ 8] = PAVGW_EXEC(uSrc1.au8[ 8], puSrc->au8[ 8]);
11483 puDst->au8[ 9] = PAVGW_EXEC(uSrc1.au8[ 9], puSrc->au8[ 9]);
11484 puDst->au8[10] = PAVGW_EXEC(uSrc1.au8[10], puSrc->au8[10]);
11485 puDst->au8[11] = PAVGW_EXEC(uSrc1.au8[11], puSrc->au8[11]);
11486 puDst->au8[12] = PAVGW_EXEC(uSrc1.au8[12], puSrc->au8[12]);
11487 puDst->au8[13] = PAVGW_EXEC(uSrc1.au8[13], puSrc->au8[13]);
11488 puDst->au8[14] = PAVGW_EXEC(uSrc1.au8[14], puSrc->au8[14]);
11489 puDst->au8[15] = PAVGW_EXEC(uSrc1.au8[15], puSrc->au8[15]);
11490}
11491
11492
11493IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11494{
11495 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11496 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11497 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11498 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11499 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11500 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11501 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11502 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11503 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11504 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11505 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11506 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11507 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11508 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11509 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11510 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11511}
11512
11513
11514IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11515{
11516 puDst->au8[ 0] = PAVGB_EXEC(puSrc1->au8[ 0], puSrc2->au8[ 0]);
11517 puDst->au8[ 1] = PAVGB_EXEC(puSrc1->au8[ 1], puSrc2->au8[ 1]);
11518 puDst->au8[ 2] = PAVGB_EXEC(puSrc1->au8[ 2], puSrc2->au8[ 2]);
11519 puDst->au8[ 3] = PAVGB_EXEC(puSrc1->au8[ 3], puSrc2->au8[ 3]);
11520 puDst->au8[ 4] = PAVGB_EXEC(puSrc1->au8[ 4], puSrc2->au8[ 4]);
11521 puDst->au8[ 5] = PAVGB_EXEC(puSrc1->au8[ 5], puSrc2->au8[ 5]);
11522 puDst->au8[ 6] = PAVGB_EXEC(puSrc1->au8[ 6], puSrc2->au8[ 6]);
11523 puDst->au8[ 7] = PAVGB_EXEC(puSrc1->au8[ 7], puSrc2->au8[ 7]);
11524 puDst->au8[ 8] = PAVGB_EXEC(puSrc1->au8[ 8], puSrc2->au8[ 8]);
11525 puDst->au8[ 9] = PAVGB_EXEC(puSrc1->au8[ 9], puSrc2->au8[ 9]);
11526 puDst->au8[10] = PAVGB_EXEC(puSrc1->au8[10], puSrc2->au8[10]);
11527 puDst->au8[11] = PAVGB_EXEC(puSrc1->au8[11], puSrc2->au8[11]);
11528 puDst->au8[12] = PAVGB_EXEC(puSrc1->au8[12], puSrc2->au8[12]);
11529 puDst->au8[13] = PAVGB_EXEC(puSrc1->au8[13], puSrc2->au8[13]);
11530 puDst->au8[14] = PAVGB_EXEC(puSrc1->au8[14], puSrc2->au8[14]);
11531 puDst->au8[15] = PAVGB_EXEC(puSrc1->au8[15], puSrc2->au8[15]);
11532 puDst->au8[16] = PAVGB_EXEC(puSrc1->au8[16], puSrc2->au8[16]);
11533 puDst->au8[17] = PAVGB_EXEC(puSrc1->au8[17], puSrc2->au8[17]);
11534 puDst->au8[18] = PAVGB_EXEC(puSrc1->au8[18], puSrc2->au8[18]);
11535 puDst->au8[19] = PAVGB_EXEC(puSrc1->au8[19], puSrc2->au8[19]);
11536 puDst->au8[20] = PAVGB_EXEC(puSrc1->au8[20], puSrc2->au8[20]);
11537 puDst->au8[21] = PAVGB_EXEC(puSrc1->au8[21], puSrc2->au8[21]);
11538 puDst->au8[22] = PAVGB_EXEC(puSrc1->au8[22], puSrc2->au8[22]);
11539 puDst->au8[23] = PAVGB_EXEC(puSrc1->au8[23], puSrc2->au8[23]);
11540 puDst->au8[24] = PAVGB_EXEC(puSrc1->au8[24], puSrc2->au8[24]);
11541 puDst->au8[25] = PAVGB_EXEC(puSrc1->au8[25], puSrc2->au8[25]);
11542 puDst->au8[26] = PAVGB_EXEC(puSrc1->au8[26], puSrc2->au8[26]);
11543 puDst->au8[27] = PAVGB_EXEC(puSrc1->au8[27], puSrc2->au8[27]);
11544 puDst->au8[28] = PAVGB_EXEC(puSrc1->au8[28], puSrc2->au8[28]);
11545 puDst->au8[29] = PAVGB_EXEC(puSrc1->au8[29], puSrc2->au8[29]);
11546 puDst->au8[30] = PAVGB_EXEC(puSrc1->au8[30], puSrc2->au8[30]);
11547 puDst->au8[31] = PAVGB_EXEC(puSrc1->au8[31], puSrc2->au8[31]);
11548}
11549
11550
11551IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11552{
11553 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11554 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11555 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11556 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11557 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11558 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11559 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11560 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11561}
11562
11563
11564IEM_DECL_IMPL_DEF(void, iemAImpl_vpavgw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11565{
11566 puDst->au16[ 0] = PAVGW_EXEC(puSrc1->au16[ 0], puSrc2->au16[ 0]);
11567 puDst->au16[ 1] = PAVGW_EXEC(puSrc1->au16[ 1], puSrc2->au16[ 1]);
11568 puDst->au16[ 2] = PAVGW_EXEC(puSrc1->au16[ 2], puSrc2->au16[ 2]);
11569 puDst->au16[ 3] = PAVGW_EXEC(puSrc1->au16[ 3], puSrc2->au16[ 3]);
11570 puDst->au16[ 4] = PAVGW_EXEC(puSrc1->au16[ 4], puSrc2->au16[ 4]);
11571 puDst->au16[ 5] = PAVGW_EXEC(puSrc1->au16[ 5], puSrc2->au16[ 5]);
11572 puDst->au16[ 6] = PAVGW_EXEC(puSrc1->au16[ 6], puSrc2->au16[ 6]);
11573 puDst->au16[ 7] = PAVGW_EXEC(puSrc1->au16[ 7], puSrc2->au16[ 7]);
11574 puDst->au16[ 8] = PAVGW_EXEC(puSrc1->au16[ 8], puSrc2->au16[ 8]);
11575 puDst->au16[ 9] = PAVGW_EXEC(puSrc1->au16[ 9], puSrc2->au16[ 9]);
11576 puDst->au16[10] = PAVGW_EXEC(puSrc1->au16[10], puSrc2->au16[10]);
11577 puDst->au16[11] = PAVGW_EXEC(puSrc1->au16[11], puSrc2->au16[11]);
11578 puDst->au16[12] = PAVGW_EXEC(puSrc1->au16[12], puSrc2->au16[12]);
11579 puDst->au16[13] = PAVGW_EXEC(puSrc1->au16[13], puSrc2->au16[13]);
11580 puDst->au16[14] = PAVGW_EXEC(puSrc1->au16[14], puSrc2->au16[14]);
11581 puDst->au16[15] = PAVGW_EXEC(puSrc1->au16[15], puSrc2->au16[15]);
11582}
11583
11584#undef PAVGB_EXEC
11585#undef PAVGW_EXEC
11586
11587
11588/*
11589 * PMOVMSKB / VPMOVMSKB
11590 */
11591#ifdef IEM_WITHOUT_ASSEMBLY
11592
11593IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u64,(uint64_t *pu64Dst, uint64_t const *pu64Src))
11594{
11595 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11596 uint64_t const uSrc = *pu64Src;
11597 *pu64Dst = ((uSrc >> ( 7-0)) & RT_BIT_64(0))
11598 | ((uSrc >> (15-1)) & RT_BIT_64(1))
11599 | ((uSrc >> (23-2)) & RT_BIT_64(2))
11600 | ((uSrc >> (31-3)) & RT_BIT_64(3))
11601 | ((uSrc >> (39-4)) & RT_BIT_64(4))
11602 | ((uSrc >> (47-5)) & RT_BIT_64(5))
11603 | ((uSrc >> (55-6)) & RT_BIT_64(6))
11604 | ((uSrc >> (63-7)) & RT_BIT_64(7));
11605}
11606
11607
11608IEM_DECL_IMPL_DEF(void, iemAImpl_pmovmskb_u128,(uint64_t *pu64Dst, PCRTUINT128U pu128Src))
11609{
11610 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11611 uint64_t const uSrc0 = pu128Src->QWords.qw0;
11612 uint64_t const uSrc1 = pu128Src->QWords.qw1;
11613 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11614 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11615 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11616 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11617 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11618 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11619 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11620 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11621 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11622 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11623 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11624 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11625 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11626 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11627 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11628 | ((uSrc1 >> (63-15)) & RT_BIT_64(15));
11629}
11630
11631#endif
11632
11633IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovmskb_u256_fallback,(uint64_t *pu64Dst, PCRTUINT256U puSrc))
11634{
11635 /* The the most signficant bit from each byte and store them in the given general purpose register. */
11636 uint64_t const uSrc0 = puSrc->QWords.qw0;
11637 uint64_t const uSrc1 = puSrc->QWords.qw1;
11638 uint64_t const uSrc2 = puSrc->QWords.qw2;
11639 uint64_t const uSrc3 = puSrc->QWords.qw3;
11640 *pu64Dst = ((uSrc0 >> ( 7-0)) & RT_BIT_64(0))
11641 | ((uSrc0 >> (15-1)) & RT_BIT_64(1))
11642 | ((uSrc0 >> (23-2)) & RT_BIT_64(2))
11643 | ((uSrc0 >> (31-3)) & RT_BIT_64(3))
11644 | ((uSrc0 >> (39-4)) & RT_BIT_64(4))
11645 | ((uSrc0 >> (47-5)) & RT_BIT_64(5))
11646 | ((uSrc0 >> (55-6)) & RT_BIT_64(6))
11647 | ((uSrc0 >> (63-7)) & RT_BIT_64(7))
11648 | ((uSrc1 << (1 /*7-8*/)) & RT_BIT_64(8))
11649 | ((uSrc1 >> (15-9)) & RT_BIT_64(9))
11650 | ((uSrc1 >> (23-10)) & RT_BIT_64(10))
11651 | ((uSrc1 >> (31-11)) & RT_BIT_64(11))
11652 | ((uSrc1 >> (39-12)) & RT_BIT_64(12))
11653 | ((uSrc1 >> (47-13)) & RT_BIT_64(13))
11654 | ((uSrc1 >> (55-14)) & RT_BIT_64(14))
11655 | ((uSrc1 >> (63-15)) & RT_BIT_64(15))
11656 | ((uSrc2 << (9 /* 7-16*/)) & RT_BIT_64(16))
11657 | ((uSrc2 << (2 /*15-17*/)) & RT_BIT_64(17))
11658 | ((uSrc2 >> (23-18)) & RT_BIT_64(18))
11659 | ((uSrc2 >> (31-19)) & RT_BIT_64(19))
11660 | ((uSrc2 >> (39-20)) & RT_BIT_64(20))
11661 | ((uSrc2 >> (47-21)) & RT_BIT_64(21))
11662 | ((uSrc2 >> (55-22)) & RT_BIT_64(22))
11663 | ((uSrc2 >> (63-23)) & RT_BIT_64(23))
11664 | ((uSrc3 << (17 /* 7-24*/)) & RT_BIT_64(24))
11665 | ((uSrc3 << (10 /*15-25*/)) & RT_BIT_64(25))
11666 | ((uSrc3 << (3 /*23-26*/)) & RT_BIT_64(26))
11667 | ((uSrc3 >> (31-27)) & RT_BIT_64(27))
11668 | ((uSrc3 >> (39-28)) & RT_BIT_64(28))
11669 | ((uSrc3 >> (47-29)) & RT_BIT_64(29))
11670 | ((uSrc3 >> (55-30)) & RT_BIT_64(30))
11671 | ((uSrc3 >> (63-31)) & RT_BIT_64(31));
11672}
11673
11674
11675/*
11676 * [V]PSHUFB
11677 */
11678
11679IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
11680{
11681 RTUINT64U const uSrc = { *puSrc };
11682 RTUINT64U const uDstIn = { *puDst };
11683 ASMCompilerBarrier();
11684 RTUINT64U uDstOut = { 0 };
11685 for (unsigned iByte = 0; iByte < RT_ELEMENTS(uDstIn.au8); iByte++)
11686 {
11687 uint8_t idxSrc = uSrc.au8[iByte];
11688 if (!(idxSrc & 0x80))
11689 uDstOut.au8[iByte] = uDstIn.au8[idxSrc & 7];
11690 }
11691 *puDst = uDstOut.u;
11692 RT_NOREF(pFpuState);
11693}
11694
11695
11696IEM_DECL_IMPL_DEF(void, iemAImpl_pshufb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
11697{
11698 RTUINT128U const uSrc = *puSrc;
11699 RTUINT128U const uDstIn = *puDst;
11700 ASMCompilerBarrier();
11701 puDst->au64[0] = 0;
11702 puDst->au64[1] = 0;
11703 for (unsigned iByte = 0; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11704 {
11705 uint8_t idxSrc = uSrc.au8[iByte];
11706 if (!(idxSrc & 0x80))
11707 puDst->au8[iByte] = uDstIn.au8[idxSrc & 15];
11708 }
11709 RT_NOREF(pFpuState);
11710}
11711
11712
11713IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u128_fallback,(PX86XSAVEAREA pExtState, PRTUINT128U puDst,
11714 PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11715{
11716 RTUINT128U const uSrc1 = *puSrc1; /* could be same as puDst */
11717 RTUINT128U const uSrc2 = *puSrc2; /* could be same as puDst */
11718 ASMCompilerBarrier();
11719 puDst->au64[0] = 0;
11720 puDst->au64[1] = 0;
11721 for (unsigned iByte = 0; iByte < 16; iByte++)
11722 {
11723 uint8_t idxSrc = uSrc2.au8[iByte];
11724 if (!(idxSrc & 0x80))
11725 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11726 }
11727 RT_NOREF(pExtState);
11728}
11729
11730
11731IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufb_u256_fallback,(PX86XSAVEAREA pExtState, PRTUINT256U puDst,
11732 PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11733{
11734 RTUINT256U const uSrc1 = *puSrc1; /* could be same as puDst */
11735 RTUINT256U const uSrc2 = *puSrc2; /* could be same as puDst */
11736 ASMCompilerBarrier();
11737 puDst->au64[0] = 0;
11738 puDst->au64[1] = 0;
11739 puDst->au64[2] = 0;
11740 puDst->au64[3] = 0;
11741 for (unsigned iByte = 0; iByte < 16; iByte++)
11742 {
11743 uint8_t idxSrc = uSrc2.au8[iByte];
11744 if (!(idxSrc & 0x80))
11745 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15)];
11746 }
11747 for (unsigned iByte = 16; iByte < RT_ELEMENTS(puDst->au8); iByte++)
11748 {
11749 uint8_t idxSrc = uSrc2.au8[iByte];
11750 if (!(idxSrc & 0x80))
11751 puDst->au8[iByte] = uSrc1.au8[(idxSrc & 15) + 16]; /* baka intel */
11752 }
11753 RT_NOREF(pExtState);
11754}
11755
11756
11757/*
11758 * PSHUFW, [V]PSHUFHW, [V]PSHUFLW, [V]PSHUFD
11759 */
11760#ifdef IEM_WITHOUT_ASSEMBLY
11761
11762IEM_DECL_IMPL_DEF(void, iemAImpl_pshufw_u64,(uint64_t *puDst, uint64_t const *puSrc, uint8_t bEvil))
11763{
11764 uint64_t const uSrc = *puSrc;
11765 ASMCompilerBarrier();
11766 *puDst = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11767 uSrc >> (((bEvil >> 2) & 3) * 16),
11768 uSrc >> (((bEvil >> 4) & 3) * 16),
11769 uSrc >> (((bEvil >> 6) & 3) * 16));
11770}
11771
11772
11773IEM_DECL_IMPL_DEF(void, iemAImpl_pshufhw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11774{
11775 puDst->QWords.qw0 = puSrc->QWords.qw0;
11776 uint64_t const uSrc = puSrc->QWords.qw1;
11777 ASMCompilerBarrier();
11778 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11779 uSrc >> (((bEvil >> 2) & 3) * 16),
11780 uSrc >> (((bEvil >> 4) & 3) * 16),
11781 uSrc >> (((bEvil >> 6) & 3) * 16));
11782}
11783
11784#endif
11785
11786IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufhw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11787{
11788 puDst->QWords.qw0 = puSrc->QWords.qw0;
11789 uint64_t const uSrc1 = puSrc->QWords.qw1;
11790 puDst->QWords.qw2 = puSrc->QWords.qw2;
11791 uint64_t const uSrc3 = puSrc->QWords.qw3;
11792 ASMCompilerBarrier();
11793 puDst->QWords.qw1 = RT_MAKE_U64_FROM_U16(uSrc1 >> (( bEvil & 3) * 16),
11794 uSrc1 >> (((bEvil >> 2) & 3) * 16),
11795 uSrc1 >> (((bEvil >> 4) & 3) * 16),
11796 uSrc1 >> (((bEvil >> 6) & 3) * 16));
11797 puDst->QWords.qw3 = RT_MAKE_U64_FROM_U16(uSrc3 >> (( bEvil & 3) * 16),
11798 uSrc3 >> (((bEvil >> 2) & 3) * 16),
11799 uSrc3 >> (((bEvil >> 4) & 3) * 16),
11800 uSrc3 >> (((bEvil >> 6) & 3) * 16));
11801}
11802
11803#ifdef IEM_WITHOUT_ASSEMBLY
11804IEM_DECL_IMPL_DEF(void, iemAImpl_pshuflw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11805{
11806 puDst->QWords.qw1 = puSrc->QWords.qw1;
11807 uint64_t const uSrc = puSrc->QWords.qw0;
11808 ASMCompilerBarrier();
11809 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc >> (( bEvil & 3) * 16),
11810 uSrc >> (((bEvil >> 2) & 3) * 16),
11811 uSrc >> (((bEvil >> 4) & 3) * 16),
11812 uSrc >> (((bEvil >> 6) & 3) * 16));
11813
11814}
11815#endif
11816
11817
11818IEM_DECL_IMPL_DEF(void, iemAImpl_vpshuflw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11819{
11820 puDst->QWords.qw3 = puSrc->QWords.qw3;
11821 uint64_t const uSrc2 = puSrc->QWords.qw2;
11822 puDst->QWords.qw1 = puSrc->QWords.qw1;
11823 uint64_t const uSrc0 = puSrc->QWords.qw0;
11824 ASMCompilerBarrier();
11825 puDst->QWords.qw0 = RT_MAKE_U64_FROM_U16(uSrc0 >> (( bEvil & 3) * 16),
11826 uSrc0 >> (((bEvil >> 2) & 3) * 16),
11827 uSrc0 >> (((bEvil >> 4) & 3) * 16),
11828 uSrc0 >> (((bEvil >> 6) & 3) * 16));
11829 puDst->QWords.qw2 = RT_MAKE_U64_FROM_U16(uSrc2 >> (( bEvil & 3) * 16),
11830 uSrc2 >> (((bEvil >> 2) & 3) * 16),
11831 uSrc2 >> (((bEvil >> 4) & 3) * 16),
11832 uSrc2 >> (((bEvil >> 6) & 3) * 16));
11833
11834}
11835
11836
11837#ifdef IEM_WITHOUT_ASSEMBLY
11838IEM_DECL_IMPL_DEF(void, iemAImpl_pshufd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
11839{
11840 RTUINT128U const uSrc = *puSrc;
11841 ASMCompilerBarrier();
11842 puDst->au32[0] = uSrc.au32[bEvil & 3];
11843 puDst->au32[1] = uSrc.au32[(bEvil >> 2) & 3];
11844 puDst->au32[2] = uSrc.au32[(bEvil >> 4) & 3];
11845 puDst->au32[3] = uSrc.au32[(bEvil >> 6) & 3];
11846}
11847#endif
11848
11849
11850IEM_DECL_IMPL_DEF(void, iemAImpl_vpshufd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc, uint8_t bEvil))
11851{
11852 RTUINT256U const uSrc = *puSrc;
11853 ASMCompilerBarrier();
11854 puDst->au128[0].au32[0] = uSrc.au128[0].au32[bEvil & 3];
11855 puDst->au128[0].au32[1] = uSrc.au128[0].au32[(bEvil >> 2) & 3];
11856 puDst->au128[0].au32[2] = uSrc.au128[0].au32[(bEvil >> 4) & 3];
11857 puDst->au128[0].au32[3] = uSrc.au128[0].au32[(bEvil >> 6) & 3];
11858 puDst->au128[1].au32[0] = uSrc.au128[1].au32[bEvil & 3];
11859 puDst->au128[1].au32[1] = uSrc.au128[1].au32[(bEvil >> 2) & 3];
11860 puDst->au128[1].au32[2] = uSrc.au128[1].au32[(bEvil >> 4) & 3];
11861 puDst->au128[1].au32[3] = uSrc.au128[1].au32[(bEvil >> 6) & 3];
11862}
11863
11864
11865/*
11866 * PUNPCKHBW - high bytes -> words
11867 */
11868#ifdef IEM_WITHOUT_ASSEMBLY
11869
11870IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
11871{
11872 RTUINT64U const uSrc2 = { *puSrc };
11873 RTUINT64U const uSrc1 = { *puDst };
11874 ASMCompilerBarrier();
11875 RTUINT64U uDstOut;
11876 uDstOut.au8[0] = uSrc1.au8[4];
11877 uDstOut.au8[1] = uSrc2.au8[4];
11878 uDstOut.au8[2] = uSrc1.au8[5];
11879 uDstOut.au8[3] = uSrc2.au8[5];
11880 uDstOut.au8[4] = uSrc1.au8[6];
11881 uDstOut.au8[5] = uSrc2.au8[6];
11882 uDstOut.au8[6] = uSrc1.au8[7];
11883 uDstOut.au8[7] = uSrc2.au8[7];
11884 *puDst = uDstOut.u;
11885}
11886
11887
11888IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
11889{
11890 RTUINT128U const uSrc2 = *puSrc;
11891 RTUINT128U const uSrc1 = *puDst;
11892 ASMCompilerBarrier();
11893 RTUINT128U uDstOut;
11894 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11895 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11896 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11897 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11898 uDstOut.au8[ 4] = uSrc1.au8[10];
11899 uDstOut.au8[ 5] = uSrc2.au8[10];
11900 uDstOut.au8[ 6] = uSrc1.au8[11];
11901 uDstOut.au8[ 7] = uSrc2.au8[11];
11902 uDstOut.au8[ 8] = uSrc1.au8[12];
11903 uDstOut.au8[ 9] = uSrc2.au8[12];
11904 uDstOut.au8[10] = uSrc1.au8[13];
11905 uDstOut.au8[11] = uSrc2.au8[13];
11906 uDstOut.au8[12] = uSrc1.au8[14];
11907 uDstOut.au8[13] = uSrc2.au8[14];
11908 uDstOut.au8[14] = uSrc1.au8[15];
11909 uDstOut.au8[15] = uSrc2.au8[15];
11910 *puDst = uDstOut;
11911}
11912
11913#endif
11914
11915IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
11916{
11917 RTUINT128U const uSrc2 = *puSrc2;
11918 RTUINT128U const uSrc1 = *puSrc1;
11919 ASMCompilerBarrier();
11920 RTUINT128U uDstOut;
11921 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11922 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11923 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11924 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11925 uDstOut.au8[ 4] = uSrc1.au8[10];
11926 uDstOut.au8[ 5] = uSrc2.au8[10];
11927 uDstOut.au8[ 6] = uSrc1.au8[11];
11928 uDstOut.au8[ 7] = uSrc2.au8[11];
11929 uDstOut.au8[ 8] = uSrc1.au8[12];
11930 uDstOut.au8[ 9] = uSrc2.au8[12];
11931 uDstOut.au8[10] = uSrc1.au8[13];
11932 uDstOut.au8[11] = uSrc2.au8[13];
11933 uDstOut.au8[12] = uSrc1.au8[14];
11934 uDstOut.au8[13] = uSrc2.au8[14];
11935 uDstOut.au8[14] = uSrc1.au8[15];
11936 uDstOut.au8[15] = uSrc2.au8[15];
11937 *puDst = uDstOut;
11938}
11939
11940
11941IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
11942{
11943 RTUINT256U const uSrc2 = *puSrc2;
11944 RTUINT256U const uSrc1 = *puSrc1;
11945 ASMCompilerBarrier();
11946 RTUINT256U uDstOut;
11947 uDstOut.au8[ 0] = uSrc1.au8[ 8];
11948 uDstOut.au8[ 1] = uSrc2.au8[ 8];
11949 uDstOut.au8[ 2] = uSrc1.au8[ 9];
11950 uDstOut.au8[ 3] = uSrc2.au8[ 9];
11951 uDstOut.au8[ 4] = uSrc1.au8[10];
11952 uDstOut.au8[ 5] = uSrc2.au8[10];
11953 uDstOut.au8[ 6] = uSrc1.au8[11];
11954 uDstOut.au8[ 7] = uSrc2.au8[11];
11955 uDstOut.au8[ 8] = uSrc1.au8[12];
11956 uDstOut.au8[ 9] = uSrc2.au8[12];
11957 uDstOut.au8[10] = uSrc1.au8[13];
11958 uDstOut.au8[11] = uSrc2.au8[13];
11959 uDstOut.au8[12] = uSrc1.au8[14];
11960 uDstOut.au8[13] = uSrc2.au8[14];
11961 uDstOut.au8[14] = uSrc1.au8[15];
11962 uDstOut.au8[15] = uSrc2.au8[15];
11963 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
11964 uDstOut.au8[16] = uSrc1.au8[24];
11965 uDstOut.au8[17] = uSrc2.au8[24];
11966 uDstOut.au8[18] = uSrc1.au8[25];
11967 uDstOut.au8[19] = uSrc2.au8[25];
11968 uDstOut.au8[20] = uSrc1.au8[26];
11969 uDstOut.au8[21] = uSrc2.au8[26];
11970 uDstOut.au8[22] = uSrc1.au8[27];
11971 uDstOut.au8[23] = uSrc2.au8[27];
11972 uDstOut.au8[24] = uSrc1.au8[28];
11973 uDstOut.au8[25] = uSrc2.au8[28];
11974 uDstOut.au8[26] = uSrc1.au8[29];
11975 uDstOut.au8[27] = uSrc2.au8[29];
11976 uDstOut.au8[28] = uSrc1.au8[30];
11977 uDstOut.au8[29] = uSrc2.au8[30];
11978 uDstOut.au8[30] = uSrc1.au8[31];
11979 uDstOut.au8[31] = uSrc2.au8[31];
11980 *puDst = uDstOut;
11981}
11982
11983
/*
 * PUNPCKHWD - high words -> dwords
 */
11987#ifdef IEM_WITHOUT_ASSEMBLY
11988
11989IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
11990{
11991 RTUINT64U const uSrc2 = { *puSrc };
11992 RTUINT64U const uSrc1 = { *puDst };
11993 ASMCompilerBarrier();
11994 RTUINT64U uDstOut;
11995 uDstOut.au16[0] = uSrc1.au16[2];
11996 uDstOut.au16[1] = uSrc2.au16[2];
11997 uDstOut.au16[2] = uSrc1.au16[3];
11998 uDstOut.au16[3] = uSrc2.au16[3];
11999 *puDst = uDstOut.u;
12000}
12001
12002
12003IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12004{
12005 RTUINT128U const uSrc2 = *puSrc;
12006 RTUINT128U const uSrc1 = *puDst;
12007 ASMCompilerBarrier();
12008 RTUINT128U uDstOut;
12009 uDstOut.au16[0] = uSrc1.au16[4];
12010 uDstOut.au16[1] = uSrc2.au16[4];
12011 uDstOut.au16[2] = uSrc1.au16[5];
12012 uDstOut.au16[3] = uSrc2.au16[5];
12013 uDstOut.au16[4] = uSrc1.au16[6];
12014 uDstOut.au16[5] = uSrc2.au16[6];
12015 uDstOut.au16[6] = uSrc1.au16[7];
12016 uDstOut.au16[7] = uSrc2.au16[7];
12017 *puDst = uDstOut;
12018}
12019
12020#endif
12021
12022IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12023{
12024 RTUINT128U const uSrc2 = *puSrc2;
12025 RTUINT128U const uSrc1 = *puSrc1;
12026 ASMCompilerBarrier();
12027 RTUINT128U uDstOut;
12028 uDstOut.au16[0] = uSrc1.au16[4];
12029 uDstOut.au16[1] = uSrc2.au16[4];
12030 uDstOut.au16[2] = uSrc1.au16[5];
12031 uDstOut.au16[3] = uSrc2.au16[5];
12032 uDstOut.au16[4] = uSrc1.au16[6];
12033 uDstOut.au16[5] = uSrc2.au16[6];
12034 uDstOut.au16[6] = uSrc1.au16[7];
12035 uDstOut.au16[7] = uSrc2.au16[7];
12036 *puDst = uDstOut;
12037}
12038
12039
12040IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12041{
12042 RTUINT256U const uSrc2 = *puSrc2;
12043 RTUINT256U const uSrc1 = *puSrc1;
12044 ASMCompilerBarrier();
12045 RTUINT256U uDstOut;
12046 uDstOut.au16[0] = uSrc1.au16[4];
12047 uDstOut.au16[1] = uSrc2.au16[4];
12048 uDstOut.au16[2] = uSrc1.au16[5];
12049 uDstOut.au16[3] = uSrc2.au16[5];
12050 uDstOut.au16[4] = uSrc1.au16[6];
12051 uDstOut.au16[5] = uSrc2.au16[6];
12052 uDstOut.au16[6] = uSrc1.au16[7];
12053 uDstOut.au16[7] = uSrc2.au16[7];
12054
12055 uDstOut.au16[8] = uSrc1.au16[12];
12056 uDstOut.au16[9] = uSrc2.au16[12];
12057 uDstOut.au16[10] = uSrc1.au16[13];
12058 uDstOut.au16[11] = uSrc2.au16[13];
12059 uDstOut.au16[12] = uSrc1.au16[14];
12060 uDstOut.au16[13] = uSrc2.au16[14];
12061 uDstOut.au16[14] = uSrc1.au16[15];
12062 uDstOut.au16[15] = uSrc2.au16[15];
12063 *puDst = uDstOut;
12064}
12065
12066
/*
 * PUNPCKHDQ - high dwords -> qword(s)
 */
12070#ifdef IEM_WITHOUT_ASSEMBLY
12071
12072IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12073{
12074 RTUINT64U const uSrc2 = { *puSrc };
12075 RTUINT64U const uSrc1 = { *puDst };
12076 ASMCompilerBarrier();
12077 RTUINT64U uDstOut;
12078 uDstOut.au32[0] = uSrc1.au32[1];
12079 uDstOut.au32[1] = uSrc2.au32[1];
12080 *puDst = uDstOut.u;
12081}
12082
12083
12084IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12085{
12086 RTUINT128U const uSrc2 = *puSrc;
12087 RTUINT128U const uSrc1 = *puDst;
12088 ASMCompilerBarrier();
12089 RTUINT128U uDstOut;
12090 uDstOut.au32[0] = uSrc1.au32[2];
12091 uDstOut.au32[1] = uSrc2.au32[2];
12092 uDstOut.au32[2] = uSrc1.au32[3];
12093 uDstOut.au32[3] = uSrc2.au32[3];
12094 *puDst = uDstOut;
12095}
12096
12097#endif
12098
12099IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12100{
12101 RTUINT128U const uSrc2 = *puSrc2;
12102 RTUINT128U const uSrc1 = *puSrc1;
12103 ASMCompilerBarrier();
12104 RTUINT128U uDstOut;
12105 uDstOut.au32[0] = uSrc1.au32[2];
12106 uDstOut.au32[1] = uSrc2.au32[2];
12107 uDstOut.au32[2] = uSrc1.au32[3];
12108 uDstOut.au32[3] = uSrc2.au32[3];
12109 *puDst = uDstOut;
12110}
12111
12112
12113IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12114{
12115 RTUINT256U const uSrc2 = *puSrc2;
12116 RTUINT256U const uSrc1 = *puSrc1;
12117 ASMCompilerBarrier();
12118 RTUINT256U uDstOut;
12119 uDstOut.au32[0] = uSrc1.au32[2];
12120 uDstOut.au32[1] = uSrc2.au32[2];
12121 uDstOut.au32[2] = uSrc1.au32[3];
12122 uDstOut.au32[3] = uSrc2.au32[3];
12123
12124 uDstOut.au32[4] = uSrc1.au32[6];
12125 uDstOut.au32[5] = uSrc2.au32[6];
12126 uDstOut.au32[6] = uSrc1.au32[7];
12127 uDstOut.au32[7] = uSrc2.au32[7];
12128 *puDst = uDstOut;
12129}
12130
12131
12132/*
12133 * PUNPCKHQDQ -> High qwords -> double qword(s).
12134 */
12135#ifdef IEM_WITHOUT_ASSEMBLY
12136IEM_DECL_IMPL_DEF(void, iemAImpl_punpckhqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12137{
12138 RTUINT128U const uSrc2 = *puSrc;
12139 RTUINT128U const uSrc1 = *puDst;
12140 ASMCompilerBarrier();
12141 RTUINT128U uDstOut;
12142 uDstOut.au64[0] = uSrc1.au64[1];
12143 uDstOut.au64[1] = uSrc2.au64[1];
12144 *puDst = uDstOut;
12145}
12146#endif
12147
12148
12149IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12150{
12151 RTUINT128U const uSrc2 = *puSrc2;
12152 RTUINT128U const uSrc1 = *puSrc1;
12153 ASMCompilerBarrier();
12154 RTUINT128U uDstOut;
12155 uDstOut.au64[0] = uSrc1.au64[1];
12156 uDstOut.au64[1] = uSrc2.au64[1];
12157 *puDst = uDstOut;
12158}
12159
12160
12161IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckhqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12162{
12163 RTUINT256U const uSrc2 = *puSrc2;
12164 RTUINT256U const uSrc1 = *puSrc1;
12165 ASMCompilerBarrier();
12166 RTUINT256U uDstOut;
12167 uDstOut.au64[0] = uSrc1.au64[1];
12168 uDstOut.au64[1] = uSrc2.au64[1];
12169
12170 uDstOut.au64[2] = uSrc1.au64[3];
12171 uDstOut.au64[3] = uSrc2.au64[3];
12172 *puDst = uDstOut;
12173}
12174
12175
12176/*
12177 * PUNPCKLBW - low bytes -> words
12178 */
12179#ifdef IEM_WITHOUT_ASSEMBLY
12180
12181IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12182{
12183 RTUINT64U const uSrc2 = { *puSrc };
12184 RTUINT64U const uSrc1 = { *puDst };
12185 ASMCompilerBarrier();
12186 RTUINT64U uDstOut;
12187 uDstOut.au8[0] = uSrc1.au8[0];
12188 uDstOut.au8[1] = uSrc2.au8[0];
12189 uDstOut.au8[2] = uSrc1.au8[1];
12190 uDstOut.au8[3] = uSrc2.au8[1];
12191 uDstOut.au8[4] = uSrc1.au8[2];
12192 uDstOut.au8[5] = uSrc2.au8[2];
12193 uDstOut.au8[6] = uSrc1.au8[3];
12194 uDstOut.au8[7] = uSrc2.au8[3];
12195 *puDst = uDstOut.u;
12196}
12197
12198
12199IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12200{
12201 RTUINT128U const uSrc2 = *puSrc;
12202 RTUINT128U const uSrc1 = *puDst;
12203 ASMCompilerBarrier();
12204 RTUINT128U uDstOut;
12205 uDstOut.au8[ 0] = uSrc1.au8[0];
12206 uDstOut.au8[ 1] = uSrc2.au8[0];
12207 uDstOut.au8[ 2] = uSrc1.au8[1];
12208 uDstOut.au8[ 3] = uSrc2.au8[1];
12209 uDstOut.au8[ 4] = uSrc1.au8[2];
12210 uDstOut.au8[ 5] = uSrc2.au8[2];
12211 uDstOut.au8[ 6] = uSrc1.au8[3];
12212 uDstOut.au8[ 7] = uSrc2.au8[3];
12213 uDstOut.au8[ 8] = uSrc1.au8[4];
12214 uDstOut.au8[ 9] = uSrc2.au8[4];
12215 uDstOut.au8[10] = uSrc1.au8[5];
12216 uDstOut.au8[11] = uSrc2.au8[5];
12217 uDstOut.au8[12] = uSrc1.au8[6];
12218 uDstOut.au8[13] = uSrc2.au8[6];
12219 uDstOut.au8[14] = uSrc1.au8[7];
12220 uDstOut.au8[15] = uSrc2.au8[7];
12221 *puDst = uDstOut;
12222}
12223
12224#endif
12225
12226IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12227{
12228 RTUINT128U const uSrc2 = *puSrc2;
12229 RTUINT128U const uSrc1 = *puSrc1;
12230 ASMCompilerBarrier();
12231 RTUINT128U uDstOut;
12232 uDstOut.au8[ 0] = uSrc1.au8[0];
12233 uDstOut.au8[ 1] = uSrc2.au8[0];
12234 uDstOut.au8[ 2] = uSrc1.au8[1];
12235 uDstOut.au8[ 3] = uSrc2.au8[1];
12236 uDstOut.au8[ 4] = uSrc1.au8[2];
12237 uDstOut.au8[ 5] = uSrc2.au8[2];
12238 uDstOut.au8[ 6] = uSrc1.au8[3];
12239 uDstOut.au8[ 7] = uSrc2.au8[3];
12240 uDstOut.au8[ 8] = uSrc1.au8[4];
12241 uDstOut.au8[ 9] = uSrc2.au8[4];
12242 uDstOut.au8[10] = uSrc1.au8[5];
12243 uDstOut.au8[11] = uSrc2.au8[5];
12244 uDstOut.au8[12] = uSrc1.au8[6];
12245 uDstOut.au8[13] = uSrc2.au8[6];
12246 uDstOut.au8[14] = uSrc1.au8[7];
12247 uDstOut.au8[15] = uSrc2.au8[7];
12248 *puDst = uDstOut;
12249}
12250
12251
12252IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12253{
12254 RTUINT256U const uSrc2 = *puSrc2;
12255 RTUINT256U const uSrc1 = *puSrc1;
12256 ASMCompilerBarrier();
12257 RTUINT256U uDstOut;
12258 uDstOut.au8[ 0] = uSrc1.au8[0];
12259 uDstOut.au8[ 1] = uSrc2.au8[0];
12260 uDstOut.au8[ 2] = uSrc1.au8[1];
12261 uDstOut.au8[ 3] = uSrc2.au8[1];
12262 uDstOut.au8[ 4] = uSrc1.au8[2];
12263 uDstOut.au8[ 5] = uSrc2.au8[2];
12264 uDstOut.au8[ 6] = uSrc1.au8[3];
12265 uDstOut.au8[ 7] = uSrc2.au8[3];
12266 uDstOut.au8[ 8] = uSrc1.au8[4];
12267 uDstOut.au8[ 9] = uSrc2.au8[4];
12268 uDstOut.au8[10] = uSrc1.au8[5];
12269 uDstOut.au8[11] = uSrc2.au8[5];
12270 uDstOut.au8[12] = uSrc1.au8[6];
12271 uDstOut.au8[13] = uSrc2.au8[6];
12272 uDstOut.au8[14] = uSrc1.au8[7];
12273 uDstOut.au8[15] = uSrc2.au8[7];
12274 /* As usual, the upper 128-bits are treated like a parallel register to the lower half. */
12275 uDstOut.au8[16] = uSrc1.au8[16];
12276 uDstOut.au8[17] = uSrc2.au8[16];
12277 uDstOut.au8[18] = uSrc1.au8[17];
12278 uDstOut.au8[19] = uSrc2.au8[17];
12279 uDstOut.au8[20] = uSrc1.au8[18];
12280 uDstOut.au8[21] = uSrc2.au8[18];
12281 uDstOut.au8[22] = uSrc1.au8[19];
12282 uDstOut.au8[23] = uSrc2.au8[19];
12283 uDstOut.au8[24] = uSrc1.au8[20];
12284 uDstOut.au8[25] = uSrc2.au8[20];
12285 uDstOut.au8[26] = uSrc1.au8[21];
12286 uDstOut.au8[27] = uSrc2.au8[21];
12287 uDstOut.au8[28] = uSrc1.au8[22];
12288 uDstOut.au8[29] = uSrc2.au8[22];
12289 uDstOut.au8[30] = uSrc1.au8[23];
12290 uDstOut.au8[31] = uSrc2.au8[23];
12291 *puDst = uDstOut;
12292}
12293
12294
/*
 * PUNPCKLWD - low words -> dwords
 */
12298#ifdef IEM_WITHOUT_ASSEMBLY
12299
12300IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u64,(uint64_t *puDst, uint64_t const *puSrc))
12301{
12302 RTUINT64U const uSrc2 = { *puSrc };
12303 RTUINT64U const uSrc1 = { *puDst };
12304 ASMCompilerBarrier();
12305 RTUINT64U uDstOut;
12306 uDstOut.au16[0] = uSrc1.au16[0];
12307 uDstOut.au16[1] = uSrc2.au16[0];
12308 uDstOut.au16[2] = uSrc1.au16[1];
12309 uDstOut.au16[3] = uSrc2.au16[1];
12310 *puDst = uDstOut.u;
12311}
12312
12313
12314IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklwd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12315{
12316 RTUINT128U const uSrc2 = *puSrc;
12317 RTUINT128U const uSrc1 = *puDst;
12318 ASMCompilerBarrier();
12319 RTUINT128U uDstOut;
12320 uDstOut.au16[0] = uSrc1.au16[0];
12321 uDstOut.au16[1] = uSrc2.au16[0];
12322 uDstOut.au16[2] = uSrc1.au16[1];
12323 uDstOut.au16[3] = uSrc2.au16[1];
12324 uDstOut.au16[4] = uSrc1.au16[2];
12325 uDstOut.au16[5] = uSrc2.au16[2];
12326 uDstOut.au16[6] = uSrc1.au16[3];
12327 uDstOut.au16[7] = uSrc2.au16[3];
12328 *puDst = uDstOut;
12329}
12330
12331#endif
12332
12333IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12334{
12335 RTUINT128U const uSrc2 = *puSrc2;
12336 RTUINT128U const uSrc1 = *puSrc1;
12337 ASMCompilerBarrier();
12338 RTUINT128U uDstOut;
12339 uDstOut.au16[0] = uSrc1.au16[0];
12340 uDstOut.au16[1] = uSrc2.au16[0];
12341 uDstOut.au16[2] = uSrc1.au16[1];
12342 uDstOut.au16[3] = uSrc2.au16[1];
12343 uDstOut.au16[4] = uSrc1.au16[2];
12344 uDstOut.au16[5] = uSrc2.au16[2];
12345 uDstOut.au16[6] = uSrc1.au16[3];
12346 uDstOut.au16[7] = uSrc2.au16[3];
12347 *puDst = uDstOut;
12348}
12349
12350
12351IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12352{
12353 RTUINT256U const uSrc2 = *puSrc2;
12354 RTUINT256U const uSrc1 = *puSrc1;
12355 ASMCompilerBarrier();
12356 RTUINT256U uDstOut;
12357 uDstOut.au16[0] = uSrc1.au16[0];
12358 uDstOut.au16[1] = uSrc2.au16[0];
12359 uDstOut.au16[2] = uSrc1.au16[1];
12360 uDstOut.au16[3] = uSrc2.au16[1];
12361 uDstOut.au16[4] = uSrc1.au16[2];
12362 uDstOut.au16[5] = uSrc2.au16[2];
12363 uDstOut.au16[6] = uSrc1.au16[3];
12364 uDstOut.au16[7] = uSrc2.au16[3];
12365
12366 uDstOut.au16[8] = uSrc1.au16[8];
12367 uDstOut.au16[9] = uSrc2.au16[8];
12368 uDstOut.au16[10] = uSrc1.au16[9];
12369 uDstOut.au16[11] = uSrc2.au16[9];
12370 uDstOut.au16[12] = uSrc1.au16[10];
12371 uDstOut.au16[13] = uSrc2.au16[10];
12372 uDstOut.au16[14] = uSrc1.au16[11];
12373 uDstOut.au16[15] = uSrc2.au16[11];
12374 *puDst = uDstOut;
12375}
12376
12377
/*
 * PUNPCKLDQ - low dwords -> qword(s)
 */
12381#ifdef IEM_WITHOUT_ASSEMBLY
12382
12383IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u64,(uint64_t *puDst, uint64_t const *puSrc))
12384{
12385 RTUINT64U const uSrc2 = { *puSrc };
12386 RTUINT64U const uSrc1 = { *puDst };
12387 ASMCompilerBarrier();
12388 RTUINT64U uDstOut;
12389 uDstOut.au32[0] = uSrc1.au32[0];
12390 uDstOut.au32[1] = uSrc2.au32[0];
12391 *puDst = uDstOut.u;
12392}
12393
12394
12395IEM_DECL_IMPL_DEF(void, iemAImpl_punpckldq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12396{
12397 RTUINT128U const uSrc2 = *puSrc;
12398 RTUINT128U const uSrc1 = *puDst;
12399 ASMCompilerBarrier();
12400 RTUINT128U uDstOut;
12401 uDstOut.au32[0] = uSrc1.au32[0];
12402 uDstOut.au32[1] = uSrc2.au32[0];
12403 uDstOut.au32[2] = uSrc1.au32[1];
12404 uDstOut.au32[3] = uSrc2.au32[1];
12405 *puDst = uDstOut;
12406}
12407
12408#endif
12409
12410IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12411{
12412 RTUINT128U const uSrc2 = *puSrc2;
12413 RTUINT128U const uSrc1 = *puSrc1;
12414 ASMCompilerBarrier();
12415 RTUINT128U uDstOut;
12416 uDstOut.au32[0] = uSrc1.au32[0];
12417 uDstOut.au32[1] = uSrc2.au32[0];
12418 uDstOut.au32[2] = uSrc1.au32[1];
12419 uDstOut.au32[3] = uSrc2.au32[1];
12420 *puDst = uDstOut;
12421}
12422
12423
12424IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpckldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12425{
12426 RTUINT256U const uSrc2 = *puSrc2;
12427 RTUINT256U const uSrc1 = *puSrc1;
12428 ASMCompilerBarrier();
12429 RTUINT256U uDstOut;
12430 uDstOut.au32[0] = uSrc1.au32[0];
12431 uDstOut.au32[1] = uSrc2.au32[0];
12432 uDstOut.au32[2] = uSrc1.au32[1];
12433 uDstOut.au32[3] = uSrc2.au32[1];
12434
12435 uDstOut.au32[4] = uSrc1.au32[4];
12436 uDstOut.au32[5] = uSrc2.au32[4];
12437 uDstOut.au32[6] = uSrc1.au32[5];
12438 uDstOut.au32[7] = uSrc2.au32[5];
12439 *puDst = uDstOut;
12440}
12441
12442
12443/*
12444 * PUNPCKLQDQ -> Low qwords -> double qword(s).
12445 */
12446#ifdef IEM_WITHOUT_ASSEMBLY
12447IEM_DECL_IMPL_DEF(void, iemAImpl_punpcklqdq_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12448{
12449 RTUINT128U const uSrc2 = *puSrc;
12450 RTUINT128U const uSrc1 = *puDst;
12451 ASMCompilerBarrier();
12452 RTUINT128U uDstOut;
12453 uDstOut.au64[0] = uSrc1.au64[0];
12454 uDstOut.au64[1] = uSrc2.au64[0];
12455 *puDst = uDstOut;
12456}
12457#endif
12458
12459
12460IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12461{
12462 RTUINT128U const uSrc2 = *puSrc2;
12463 RTUINT128U const uSrc1 = *puSrc1;
12464 ASMCompilerBarrier();
12465 RTUINT128U uDstOut;
12466 uDstOut.au64[0] = uSrc1.au64[0];
12467 uDstOut.au64[1] = uSrc2.au64[0];
12468 *puDst = uDstOut;
12469}
12470
12471
12472IEM_DECL_IMPL_DEF(void, iemAImpl_vpunpcklqdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12473{
12474 RTUINT256U const uSrc2 = *puSrc2;
12475 RTUINT256U const uSrc1 = *puSrc1;
12476 ASMCompilerBarrier();
12477 RTUINT256U uDstOut;
12478 uDstOut.au64[0] = uSrc1.au64[0];
12479 uDstOut.au64[1] = uSrc2.au64[0];
12480
12481 uDstOut.au64[2] = uSrc1.au64[2];
12482 uDstOut.au64[3] = uSrc2.au64[2];
12483 *puDst = uDstOut;
12484}
12485
12486
12487/*
12488 * PACKSSWB - signed words -> signed bytes
12489 */
12490
12491#ifdef IEM_WITHOUT_ASSEMBLY
12492
12493IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12494{
12495 RTUINT64U const uSrc2 = { *puSrc };
12496 RTUINT64U const uSrc1 = { *puDst };
12497 ASMCompilerBarrier();
12498 RTUINT64U uDstOut;
12499 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12500 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12501 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12502 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12503 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12504 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12505 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12506 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12507 *puDst = uDstOut.u;
12508}
12509
12510
12511IEM_DECL_IMPL_DEF(void, iemAImpl_packsswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12512{
12513 RTUINT128U const uSrc2 = *puSrc;
12514 RTUINT128U const uSrc1 = *puDst;
12515 ASMCompilerBarrier();
12516 RTUINT128U uDstOut;
12517 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12518 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12519 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12520 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12521 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12522 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12523 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12524 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12525 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12526 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12527 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12528 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12529 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12530 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12531 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12532 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12533 *puDst = uDstOut;
12534}
12535
12536#endif
12537
12538IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12539{
12540 RTUINT128U const uSrc2 = *puSrc2;
12541 RTUINT128U const uSrc1 = *puSrc1;
12542 ASMCompilerBarrier();
12543 RTUINT128U uDstOut;
12544 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12545 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12546 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12547 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12548 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12549 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12550 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12551 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12552 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12553 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12554 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12555 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12556 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12557 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12558 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12559 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12560 *puDst = uDstOut;
12561}
12562
12563
12564IEM_DECL_IMPL_DEF(void, iemAImpl_vpacksswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12565{
12566 RTUINT256U const uSrc2 = *puSrc2;
12567 RTUINT256U const uSrc1 = *puSrc1;
12568 ASMCompilerBarrier();
12569 RTUINT256U uDstOut;
12570 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[0]);
12571 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[1]);
12572 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[2]);
12573 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[3]);
12574 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[4]);
12575 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[5]);
12576 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[6]);
12577 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[7]);
12578 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[0]);
12579 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[1]);
12580 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[2]);
12581 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[3]);
12582 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[4]);
12583 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[5]);
12584 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[6]);
12585 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[7]);
12586
12587 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 8]);
12588 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[ 9]);
12589 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[10]);
12590 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[11]);
12591 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[12]);
12592 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[13]);
12593 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[14]);
12594 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc1.au16[15]);
12595 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 8]);
12596 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[ 9]);
12597 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[10]);
12598 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[11]);
12599 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[12]);
12600 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[13]);
12601 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[14]);
12602 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_SIGNED_BYTE(uSrc2.au16[15]);
12603 *puDst = uDstOut;
12604}
12605
12606
12607/*
12608 * PACKUSWB - signed words -> unsigned bytes
12609 */
/* Converts a 16-bit value, interpreted as signed, to an unsigned byte:
   0..0xff pass through unchanged, anything else becomes 0xff when source
   bit 15 (the sign) is clear and 0x00 when it is set. */
#define SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(a_iWord) \
    (  (uint16_t)(a_iWord) > (uint16_t)0xff \
     ? (uint8_t)0xff * (uint8_t)((((a_iWord) >> 15) & 1) ^ 1) \
     : (uint8_t)(a_iWord) )
12614
12615#ifdef IEM_WITHOUT_ASSEMBLY
12616
12617IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u64,(uint64_t *puDst, uint64_t const *puSrc))
12618{
12619 RTUINT64U const uSrc2 = { *puSrc };
12620 RTUINT64U const uSrc1 = { *puDst };
12621 ASMCompilerBarrier();
12622 RTUINT64U uDstOut;
12623 uDstOut.au8[0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12624 uDstOut.au8[1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12625 uDstOut.au8[2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12626 uDstOut.au8[3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12627 uDstOut.au8[4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12628 uDstOut.au8[5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12629 uDstOut.au8[6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12630 uDstOut.au8[7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12631 *puDst = uDstOut.u;
12632}
12633
12634
12635IEM_DECL_IMPL_DEF(void, iemAImpl_packuswb_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12636{
12637 RTUINT128U const uSrc2 = *puSrc;
12638 RTUINT128U const uSrc1 = *puDst;
12639 ASMCompilerBarrier();
12640 RTUINT128U uDstOut;
12641 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12642 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12643 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12644 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12645 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12646 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12647 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12648 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12649 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12650 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12651 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12652 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12653 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12654 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12655 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12656 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12657 *puDst = uDstOut;
12658}
12659
12660#endif
12661
12662IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12663{
12664 RTUINT128U const uSrc2 = *puSrc2;
12665 RTUINT128U const uSrc1 = *puSrc1;
12666 ASMCompilerBarrier();
12667 RTUINT128U uDstOut;
12668 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12669 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12670 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12671 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12672 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12673 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12674 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12675 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12676 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12677 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12678 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12679 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12680 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12681 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12682 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12683 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12684 *puDst = uDstOut;
12685}
12686
12687
12688IEM_DECL_IMPL_DEF(void, iemAImpl_vpackuswb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12689{
12690 RTUINT256U const uSrc2 = *puSrc2;
12691 RTUINT256U const uSrc1 = *puSrc1;
12692 ASMCompilerBarrier();
12693 RTUINT256U uDstOut;
12694 uDstOut.au8[ 0] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[0]);
12695 uDstOut.au8[ 1] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[1]);
12696 uDstOut.au8[ 2] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[2]);
12697 uDstOut.au8[ 3] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[3]);
12698 uDstOut.au8[ 4] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[4]);
12699 uDstOut.au8[ 5] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[5]);
12700 uDstOut.au8[ 6] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[6]);
12701 uDstOut.au8[ 7] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[7]);
12702 uDstOut.au8[ 8] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[0]);
12703 uDstOut.au8[ 9] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[1]);
12704 uDstOut.au8[10] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[2]);
12705 uDstOut.au8[11] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[3]);
12706 uDstOut.au8[12] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[4]);
12707 uDstOut.au8[13] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[5]);
12708 uDstOut.au8[14] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[6]);
12709 uDstOut.au8[15] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[7]);
12710
12711 uDstOut.au8[16] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 8]);
12712 uDstOut.au8[17] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[ 9]);
12713 uDstOut.au8[18] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[10]);
12714 uDstOut.au8[19] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[11]);
12715 uDstOut.au8[20] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[12]);
12716 uDstOut.au8[21] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[13]);
12717 uDstOut.au8[22] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[14]);
12718 uDstOut.au8[23] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc1.au16[15]);
12719 uDstOut.au8[24] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 8]);
12720 uDstOut.au8[25] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[ 9]);
12721 uDstOut.au8[26] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[10]);
12722 uDstOut.au8[27] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[11]);
12723 uDstOut.au8[28] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[12]);
12724 uDstOut.au8[29] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[13]);
12725 uDstOut.au8[30] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[14]);
12726 uDstOut.au8[31] = SATURATED_SIGNED_WORD_TO_UNSIGNED_BYTE(uSrc2.au16[15]);
12727 *puDst = uDstOut;
12728}
12729
12730
12731/*
12732 * PACKSSDW - signed dwords -> signed words
12733 */
12734
12735#ifdef IEM_WITHOUT_ASSEMBLY
12736
12737IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u64,(uint64_t *puDst, uint64_t const *puSrc))
12738{
12739 RTUINT64U const uSrc2 = { *puSrc };
12740 RTUINT64U const uSrc1 = { *puDst };
12741 ASMCompilerBarrier();
12742 RTUINT64U uDstOut;
12743 uDstOut.au16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12744 uDstOut.au16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12745 uDstOut.au16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12746 uDstOut.au16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12747 *puDst = uDstOut.u;
12748}
12749
12750
12751IEM_DECL_IMPL_DEF(void, iemAImpl_packssdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12752{
12753 RTUINT128U const uSrc2 = *puSrc;
12754 RTUINT128U const uSrc1 = *puDst;
12755 ASMCompilerBarrier();
12756 RTUINT128U uDstOut;
12757 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12758 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12759 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12760 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12761 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12762 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12763 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12764 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12765 *puDst = uDstOut;
12766}
12767
12768#endif
12769
12770IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12771{
12772 RTUINT128U const uSrc2 = *puSrc2;
12773 RTUINT128U const uSrc1 = *puSrc1;
12774 ASMCompilerBarrier();
12775 RTUINT128U uDstOut;
12776 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12777 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12778 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12779 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12780 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12781 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12782 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12783 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12784 *puDst = uDstOut;
12785}
12786
12787
12788IEM_DECL_IMPL_DEF(void, iemAImpl_vpackssdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12789{
12790 RTUINT256U const uSrc2 = *puSrc2;
12791 RTUINT256U const uSrc1 = *puSrc1;
12792 ASMCompilerBarrier();
12793 RTUINT256U uDstOut;
12794 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[0]);
12795 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[1]);
12796 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[2]);
12797 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[3]);
12798 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[0]);
12799 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[1]);
12800 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[2]);
12801 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[3]);
12802
12803 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[4]);
12804 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[5]);
12805 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[6]);
12806 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.au32[7]);
12807 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[4]);
12808 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[5]);
12809 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[6]);
12810 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.au32[7]);
12811 *puDst = uDstOut;
12812}
12813
12814
12815/*
12816 * PACKUSDW - signed dwords -> unsigned words
12817 */
/* Converts a 32-bit value, interpreted as signed, to an unsigned word:
   0..0xffff pass through unchanged, anything else becomes 0xffff when source
   bit 31 (the sign) is clear and 0x0000 when it is set. */
#define SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(a_iDword) \
    (  (uint32_t)(a_iDword) > (uint16_t)0xffff \
     ? (uint16_t)0xffff * (uint16_t)((((a_iDword) >> 31) & 1) ^ 1) \
     : (uint16_t)(a_iDword) )
12822
12823#ifdef IEM_WITHOUT_ASSEMBLY
12824IEM_DECL_IMPL_DEF(void, iemAImpl_packusdw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12825{
12826 RTUINT128U const uSrc2 = *puSrc;
12827 RTUINT128U const uSrc1 = *puDst;
12828 ASMCompilerBarrier();
12829 RTUINT128U uDstOut;
12830 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12831 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12832 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12833 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12834 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12835 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12836 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12837 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12838 *puDst = uDstOut;
12839}
12840#endif
12841
12842IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
12843{
12844 RTUINT128U const uSrc2 = *puSrc2;
12845 RTUINT128U const uSrc1 = *puSrc1;
12846 ASMCompilerBarrier();
12847 RTUINT128U uDstOut;
12848 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12849 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12850 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12851 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12852 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12853 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12854 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12855 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12856 *puDst = uDstOut;
12857}
12858
12859
12860IEM_DECL_IMPL_DEF(void, iemAImpl_vpackusdw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
12861{
12862 RTUINT256U const uSrc2 = *puSrc2;
12863 RTUINT256U const uSrc1 = *puSrc1;
12864 ASMCompilerBarrier();
12865 RTUINT256U uDstOut;
12866 uDstOut.au16[ 0] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[0]);
12867 uDstOut.au16[ 1] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[1]);
12868 uDstOut.au16[ 2] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[2]);
12869 uDstOut.au16[ 3] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[3]);
12870 uDstOut.au16[ 4] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[0]);
12871 uDstOut.au16[ 5] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[1]);
12872 uDstOut.au16[ 6] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[2]);
12873 uDstOut.au16[ 7] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[3]);
12874
12875 uDstOut.au16[ 8] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[4]);
12876 uDstOut.au16[ 9] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[5]);
12877 uDstOut.au16[10] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[6]);
12878 uDstOut.au16[11] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc1.au32[7]);
12879 uDstOut.au16[12] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[4]);
12880 uDstOut.au16[13] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[5]);
12881 uDstOut.au16[14] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[6]);
12882 uDstOut.au16[15] = SATURATED_SIGNED_DWORD_TO_UNSIGNED_WORD(uSrc2.au32[7]);
12883 *puDst = uDstOut;
12884}
12885
12886
12887/*
12888 * [V]PABSB / [V]PABSW / [V]PABSD
12889 */
12890
12891IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12892{
12893 RTUINT64U const uSrc = { *puSrc };
12894 RTUINT64U uDstOut = { 0 };
12895
12896 uDstOut.au8[0] = RT_ABS(uSrc.ai8[0]);
12897 uDstOut.au8[1] = RT_ABS(uSrc.ai8[1]);
12898 uDstOut.au8[2] = RT_ABS(uSrc.ai8[2]);
12899 uDstOut.au8[3] = RT_ABS(uSrc.ai8[3]);
12900 uDstOut.au8[4] = RT_ABS(uSrc.ai8[4]);
12901 uDstOut.au8[5] = RT_ABS(uSrc.ai8[5]);
12902 uDstOut.au8[6] = RT_ABS(uSrc.ai8[6]);
12903 uDstOut.au8[7] = RT_ABS(uSrc.ai8[7]);
12904 *puDst = uDstOut.u;
12905 RT_NOREF(pFpuState);
12906}
12907
12908
12909IEM_DECL_IMPL_DEF(void, iemAImpl_pabsb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12910{
12911 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12912 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12913 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12914 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12915 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12916 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12917 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12918 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12919 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12920 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12921 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12922 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12923 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12924 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12925 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12926 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12927 RT_NOREF(pFpuState);
12928}
12929
12930
12931IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12932{
12933 RTUINT64U const uSrc = { *puSrc };
12934 RTUINT64U uDstOut = { 0 };
12935
12936 uDstOut.au16[0] = RT_ABS(uSrc.ai16[0]);
12937 uDstOut.au16[1] = RT_ABS(uSrc.ai16[1]);
12938 uDstOut.au16[2] = RT_ABS(uSrc.ai16[2]);
12939 uDstOut.au16[3] = RT_ABS(uSrc.ai16[3]);
12940 *puDst = uDstOut.u;
12941 RT_NOREF(pFpuState);
12942}
12943
12944
12945IEM_DECL_IMPL_DEF(void, iemAImpl_pabsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12946{
12947 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
12948 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
12949 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
12950 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
12951 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
12952 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
12953 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
12954 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
12955 RT_NOREF(pFpuState);
12956}
12957
12958
12959IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
12960{
12961 RTUINT64U const uSrc = { *puSrc };
12962 RTUINT64U uDstOut = { 0 };
12963
12964 uDstOut.au32[0] = RT_ABS(uSrc.ai32[0]);
12965 uDstOut.au32[1] = RT_ABS(uSrc.ai32[1]);
12966 *puDst = uDstOut.u;
12967 RT_NOREF(pFpuState);
12968}
12969
12970
12971IEM_DECL_IMPL_DEF(void, iemAImpl_pabsd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
12972{
12973 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
12974 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
12975 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
12976 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
12977 RT_NOREF(pFpuState);
12978}
12979
12980
12981IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
12982{
12983 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
12984 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
12985 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
12986 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
12987 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
12988 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
12989 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
12990 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
12991 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
12992 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
12993 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
12994 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
12995 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
12996 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
12997 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
12998 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
12999}
13000
13001
13002IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13003{
13004 puDst->au8[ 0] = RT_ABS(puSrc->ai8[ 0]);
13005 puDst->au8[ 1] = RT_ABS(puSrc->ai8[ 1]);
13006 puDst->au8[ 2] = RT_ABS(puSrc->ai8[ 2]);
13007 puDst->au8[ 3] = RT_ABS(puSrc->ai8[ 3]);
13008 puDst->au8[ 4] = RT_ABS(puSrc->ai8[ 4]);
13009 puDst->au8[ 5] = RT_ABS(puSrc->ai8[ 5]);
13010 puDst->au8[ 6] = RT_ABS(puSrc->ai8[ 6]);
13011 puDst->au8[ 7] = RT_ABS(puSrc->ai8[ 7]);
13012 puDst->au8[ 8] = RT_ABS(puSrc->ai8[ 8]);
13013 puDst->au8[ 9] = RT_ABS(puSrc->ai8[ 9]);
13014 puDst->au8[10] = RT_ABS(puSrc->ai8[10]);
13015 puDst->au8[11] = RT_ABS(puSrc->ai8[11]);
13016 puDst->au8[12] = RT_ABS(puSrc->ai8[12]);
13017 puDst->au8[13] = RT_ABS(puSrc->ai8[13]);
13018 puDst->au8[14] = RT_ABS(puSrc->ai8[14]);
13019 puDst->au8[15] = RT_ABS(puSrc->ai8[15]);
13020 puDst->au8[16] = RT_ABS(puSrc->ai8[16]);
13021 puDst->au8[17] = RT_ABS(puSrc->ai8[17]);
13022 puDst->au8[18] = RT_ABS(puSrc->ai8[18]);
13023 puDst->au8[19] = RT_ABS(puSrc->ai8[19]);
13024 puDst->au8[20] = RT_ABS(puSrc->ai8[20]);
13025 puDst->au8[21] = RT_ABS(puSrc->ai8[21]);
13026 puDst->au8[22] = RT_ABS(puSrc->ai8[22]);
13027 puDst->au8[23] = RT_ABS(puSrc->ai8[23]);
13028 puDst->au8[24] = RT_ABS(puSrc->ai8[24]);
13029 puDst->au8[25] = RT_ABS(puSrc->ai8[25]);
13030 puDst->au8[26] = RT_ABS(puSrc->ai8[26]);
13031 puDst->au8[27] = RT_ABS(puSrc->ai8[27]);
13032 puDst->au8[28] = RT_ABS(puSrc->ai8[28]);
13033 puDst->au8[29] = RT_ABS(puSrc->ai8[29]);
13034 puDst->au8[30] = RT_ABS(puSrc->ai8[30]);
13035 puDst->au8[31] = RT_ABS(puSrc->ai8[31]);
13036}
13037
13038
13039IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13040{
13041 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13042 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13043 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13044 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13045 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13046 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13047 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13048 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13049}
13050
13051
13052IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13053{
13054 puDst->au16[ 0] = RT_ABS(puSrc->ai16[ 0]);
13055 puDst->au16[ 1] = RT_ABS(puSrc->ai16[ 1]);
13056 puDst->au16[ 2] = RT_ABS(puSrc->ai16[ 2]);
13057 puDst->au16[ 3] = RT_ABS(puSrc->ai16[ 3]);
13058 puDst->au16[ 4] = RT_ABS(puSrc->ai16[ 4]);
13059 puDst->au16[ 5] = RT_ABS(puSrc->ai16[ 5]);
13060 puDst->au16[ 6] = RT_ABS(puSrc->ai16[ 6]);
13061 puDst->au16[ 7] = RT_ABS(puSrc->ai16[ 7]);
13062 puDst->au16[ 8] = RT_ABS(puSrc->ai16[ 8]);
13063 puDst->au16[ 9] = RT_ABS(puSrc->ai16[ 9]);
13064 puDst->au16[10] = RT_ABS(puSrc->ai16[10]);
13065 puDst->au16[11] = RT_ABS(puSrc->ai16[11]);
13066 puDst->au16[12] = RT_ABS(puSrc->ai16[12]);
13067 puDst->au16[13] = RT_ABS(puSrc->ai16[13]);
13068 puDst->au16[14] = RT_ABS(puSrc->ai16[14]);
13069 puDst->au16[15] = RT_ABS(puSrc->ai16[15]);
13070}
13071
13072
13073IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13074{
13075 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13076 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13077 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13078 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13079}
13080
13081
13082IEM_DECL_IMPL_DEF(void, iemAImpl_vpabsd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc))
13083{
13084 puDst->au32[ 0] = RT_ABS(puSrc->ai32[ 0]);
13085 puDst->au32[ 1] = RT_ABS(puSrc->ai32[ 1]);
13086 puDst->au32[ 2] = RT_ABS(puSrc->ai32[ 2]);
13087 puDst->au32[ 3] = RT_ABS(puSrc->ai32[ 3]);
13088 puDst->au32[ 4] = RT_ABS(puSrc->ai32[ 4]);
13089 puDst->au32[ 5] = RT_ABS(puSrc->ai32[ 5]);
13090 puDst->au32[ 6] = RT_ABS(puSrc->ai32[ 6]);
13091 puDst->au32[ 7] = RT_ABS(puSrc->ai32[ 7]);
13092}
13093
13094
13095/*
13096 * PSIGNB / VPSIGNB / PSIGNW / VPSIGNW / PSIGND / VPSIGND
13097 */
13098IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13099{
13100 RTUINT64U uSrc1 = { *puDst };
13101 RTUINT64U uSrc2 = { *puSrc };
13102 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13103
13104 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai8); i++)
13105 {
13106 if (uSrc2.ai8[i] < 0)
13107 uDst.ai8[i] = -uSrc1.ai8[i];
13108 else if (uSrc2.ai8[i] == 0)
13109 uDst.ai8[i] = 0;
13110 else /* uSrc2.ai8[i] > 0 */
13111 uDst.ai8[i] = uSrc1.ai8[i];
13112 }
13113
13114 *puDst = uDst.u;
13115 RT_NOREF(pFpuState);
13116}
13117
13118
13119IEM_DECL_IMPL_DEF(void, iemAImpl_psignb_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13120{
13121 RTUINT128U uSrc1 = *puDst;
13122
13123 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13124 {
13125 if (puSrc->ai8[i] < 0)
13126 puDst->ai8[i] = -uSrc1.ai8[i];
13127 else if (puSrc->ai8[i] == 0)
13128 puDst->ai8[i] = 0;
13129 else /* puSrc->ai8[i] > 0 */
13130 puDst->ai8[i] = uSrc1.ai8[i];
13131 }
13132
13133 RT_NOREF(pFpuState);
13134}
13135
13136
13137IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13138{
13139 RTUINT64U uSrc1 = { *puDst };
13140 RTUINT64U uSrc2 = { *puSrc };
13141 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13142
13143 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai16); i++)
13144 {
13145 if (uSrc2.ai16[i] < 0)
13146 uDst.ai16[i] = -uSrc1.ai16[i];
13147 else if (uSrc2.ai16[i] == 0)
13148 uDst.ai16[i] = 0;
13149 else /* uSrc2.ai16[i] > 0 */
13150 uDst.ai16[i] = uSrc1.ai16[i];
13151 }
13152
13153 *puDst = uDst.u;
13154 RT_NOREF(pFpuState);
13155}
13156
13157
13158IEM_DECL_IMPL_DEF(void, iemAImpl_psignw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13159{
13160 RTUINT128U uSrc1 = *puDst;
13161
13162 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13163 {
13164 if (puSrc->ai16[i] < 0)
13165 puDst->ai16[i] = -uSrc1.ai16[i];
13166 else if (puSrc->ai16[i] == 0)
13167 puDst->ai16[i] = 0;
13168 else /* puSrc->ai16[i] > 0 */
13169 puDst->ai16[i] = uSrc1.ai16[i];
13170 }
13171
13172 RT_NOREF(pFpuState);
13173}
13174
13175
13176IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13177{
13178 RTUINT64U uSrc1 = { *puDst };
13179 RTUINT64U uSrc2 = { *puSrc };
13180 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13181
13182 for (uint32_t i = 0; i < RT_ELEMENTS(uDst.ai32); i++)
13183 {
13184 if (uSrc2.ai32[i] < 0)
13185 uDst.ai32[i] = -uSrc1.ai32[i];
13186 else if (uSrc2.ai32[i] == 0)
13187 uDst.ai32[i] = 0;
13188 else /* uSrc2.ai32[i] > 0 */
13189 uDst.ai32[i] = uSrc1.ai32[i];
13190 }
13191
13192 *puDst = uDst.u;
13193 RT_NOREF(pFpuState);
13194}
13195
13196
13197IEM_DECL_IMPL_DEF(void, iemAImpl_psignd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13198{
13199 RTUINT128U uSrc1 = *puDst;
13200
13201 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13202 {
13203 if (puSrc->ai32[i] < 0)
13204 puDst->ai32[i] = -uSrc1.ai32[i];
13205 else if (puSrc->ai32[i] == 0)
13206 puDst->ai32[i] = 0;
13207 else /* puSrc->ai32[i] > 0 */
13208 puDst->ai32[i] = uSrc1.ai32[i];
13209 }
13210
13211 RT_NOREF(pFpuState);
13212}
13213
13214
13215IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13216{
13217 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13218 {
13219 if (puSrc2->ai8[i] < 0)
13220 puDst->ai8[i] = -puSrc1->ai8[i];
13221 else if (puSrc2->ai8[i] == 0)
13222 puDst->ai8[i] = 0;
13223 else /* puSrc2->ai8[i] > 0 */
13224 puDst->ai8[i] = puSrc1->ai8[i];
13225 }
13226}
13227
13228
13229IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13230{
13231 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai8); i++)
13232 {
13233 if (puSrc2->ai8[i] < 0)
13234 puDst->ai8[i] = -puSrc1->ai8[i];
13235 else if (puSrc2->ai8[i] == 0)
13236 puDst->ai8[i] = 0;
13237 else /* puSrc2->ai8[i] > 0 */
13238 puDst->ai8[i] = puSrc1->ai8[i];
13239 }
13240}
13241
13242
13243IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13244{
13245 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13246 {
13247 if (puSrc2->ai16[i] < 0)
13248 puDst->ai16[i] = -puSrc1->ai16[i];
13249 else if (puSrc2->ai16[i] == 0)
13250 puDst->ai16[i] = 0;
13251 else /* puSrc2->ai16[i] > 0 */
13252 puDst->ai16[i] = puSrc1->ai16[i];
13253 }
13254}
13255
13256
13257IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13258{
13259 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai16); i++)
13260 {
13261 if (puSrc2->ai16[i] < 0)
13262 puDst->ai16[i] = -puSrc1->ai16[i];
13263 else if (puSrc2->ai16[i] == 0)
13264 puDst->ai16[i] = 0;
13265 else /* puSrc2->ai16[i] > 0 */
13266 puDst->ai16[i] = puSrc1->ai16[i];
13267 }
13268}
13269
13270
13271IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13272{
13273 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13274 {
13275 if (puSrc2->ai32[i] < 0)
13276 puDst->ai32[i] = -puSrc1->ai32[i];
13277 else if (puSrc2->ai32[i] == 0)
13278 puDst->ai32[i] = 0;
13279 else /* puSrc2->ai32[i] > 0 */
13280 puDst->ai32[i] = puSrc1->ai32[i];
13281 }
13282}
13283
13284
13285IEM_DECL_IMPL_DEF(void, iemAImpl_vpsignd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13286{
13287 for (uint32_t i = 0; i < RT_ELEMENTS(puDst->ai32); i++)
13288 {
13289 if (puSrc2->ai32[i] < 0)
13290 puDst->ai32[i] = -puSrc1->ai32[i];
13291 else if (puSrc2->ai32[i] == 0)
13292 puDst->ai32[i] = 0;
13293 else /* puSrc2->ai32[i] > 0 */
13294 puDst->ai32[i] = puSrc1->ai32[i];
13295 }
13296}
13297
13298
13299/*
13300 * PHADDW / VPHADDW / PHADDD / VPHADDD
13301 */
13302IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13303{
13304 RTUINT64U uSrc1 = { *puDst };
13305 RTUINT64U uSrc2 = { *puSrc };
13306 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13307
13308 uDst.ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13309 uDst.ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13310 uDst.ai16[2] = uSrc2.ai16[0] + uSrc2.ai16[1];
13311 uDst.ai16[3] = uSrc2.ai16[2] + uSrc2.ai16[3];
13312 *puDst = uDst.u;
13313 RT_NOREF(pFpuState);
13314}
13315
13316
13317IEM_DECL_IMPL_DEF(void, iemAImpl_phaddw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13318{
13319 RTUINT128U uSrc1 = *puDst;
13320
13321 puDst->ai16[0] = uSrc1.ai16[0] + uSrc1.ai16[1];
13322 puDst->ai16[1] = uSrc1.ai16[2] + uSrc1.ai16[3];
13323 puDst->ai16[2] = uSrc1.ai16[4] + uSrc1.ai16[5];
13324 puDst->ai16[3] = uSrc1.ai16[6] + uSrc1.ai16[7];
13325
13326 puDst->ai16[4] = puSrc->ai16[0] + puSrc->ai16[1];
13327 puDst->ai16[5] = puSrc->ai16[2] + puSrc->ai16[3];
13328 puDst->ai16[6] = puSrc->ai16[4] + puSrc->ai16[5];
13329 puDst->ai16[7] = puSrc->ai16[6] + puSrc->ai16[7];
13330 RT_NOREF(pFpuState);
13331}
13332
13333
13334IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13335{
13336 RTUINT64U uSrc1 = { *puDst };
13337 RTUINT64U uSrc2 = { *puSrc };
13338 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13339
13340 uDst.ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13341 uDst.ai32[1] = uSrc2.ai32[0] + uSrc2.ai32[1];
13342 *puDst = uDst.u;
13343 RT_NOREF(pFpuState);
13344}
13345
13346
13347IEM_DECL_IMPL_DEF(void, iemAImpl_phaddd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13348{
13349 RTUINT128U uSrc1 = *puDst;
13350
13351 puDst->ai32[0] = uSrc1.ai32[0] + uSrc1.ai32[1];
13352 puDst->ai32[1] = uSrc1.ai32[2] + uSrc1.ai32[3];
13353
13354 puDst->ai32[2] = puSrc->ai32[0] + puSrc->ai32[1];
13355 puDst->ai32[3] = puSrc->ai32[2] + puSrc->ai32[3];
13356 RT_NOREF(pFpuState);
13357}
13358
13359
13360IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13361{
13362 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13363
13364 uDst.ai16[0] = puSrc1->ai16[0] + puSrc1->ai16[1];
13365 uDst.ai16[1] = puSrc1->ai16[2] + puSrc1->ai16[3];
13366 uDst.ai16[2] = puSrc1->ai16[4] + puSrc1->ai16[5];
13367 uDst.ai16[3] = puSrc1->ai16[6] + puSrc1->ai16[7];
13368
13369 uDst.ai16[4] = puSrc2->ai16[0] + puSrc2->ai16[1];
13370 uDst.ai16[5] = puSrc2->ai16[2] + puSrc2->ai16[3];
13371 uDst.ai16[6] = puSrc2->ai16[4] + puSrc2->ai16[5];
13372 uDst.ai16[7] = puSrc2->ai16[6] + puSrc2->ai16[7];
13373
13374 puDst->au64[0] = uDst.au64[0];
13375 puDst->au64[1] = uDst.au64[1];
13376}
13377
13378
13379IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13380{
13381 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13382
13383 uDst.ai16[ 0] = puSrc1->ai16[ 0] + puSrc1->ai16[ 1];
13384 uDst.ai16[ 1] = puSrc1->ai16[ 2] + puSrc1->ai16[ 3];
13385 uDst.ai16[ 2] = puSrc1->ai16[ 4] + puSrc1->ai16[ 5];
13386 uDst.ai16[ 3] = puSrc1->ai16[ 6] + puSrc1->ai16[ 7];
13387 uDst.ai16[ 4] = puSrc2->ai16[ 0] + puSrc2->ai16[ 1];
13388 uDst.ai16[ 5] = puSrc2->ai16[ 2] + puSrc2->ai16[ 3];
13389 uDst.ai16[ 6] = puSrc2->ai16[ 4] + puSrc2->ai16[ 5];
13390 uDst.ai16[ 7] = puSrc2->ai16[ 6] + puSrc2->ai16[ 7];
13391
13392 uDst.ai16[ 8] = puSrc1->ai16[ 8] + puSrc1->ai16[ 9];
13393 uDst.ai16[ 9] = puSrc1->ai16[10] + puSrc1->ai16[11];
13394 uDst.ai16[10] = puSrc1->ai16[12] + puSrc1->ai16[13];
13395 uDst.ai16[11] = puSrc1->ai16[14] + puSrc1->ai16[15];
13396 uDst.ai16[12] = puSrc2->ai16[ 8] + puSrc2->ai16[ 9];
13397 uDst.ai16[13] = puSrc2->ai16[10] + puSrc2->ai16[11];
13398 uDst.ai16[14] = puSrc2->ai16[12] + puSrc2->ai16[13];
13399 uDst.ai16[15] = puSrc2->ai16[14] + puSrc2->ai16[15];
13400
13401 puDst->au64[0] = uDst.au64[0];
13402 puDst->au64[1] = uDst.au64[1];
13403 puDst->au64[2] = uDst.au64[2];
13404 puDst->au64[3] = uDst.au64[3];
13405}
13406
13407
13408IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13409{
13410 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13411
13412 uDst.ai32[0] = puSrc1->ai32[0] + puSrc1->ai32[1];
13413 uDst.ai32[1] = puSrc1->ai32[2] + puSrc1->ai32[3];
13414
13415 uDst.ai32[2] = puSrc2->ai32[0] + puSrc2->ai32[1];
13416 uDst.ai32[3] = puSrc2->ai32[2] + puSrc2->ai32[3];
13417
13418 puDst->au64[0] = uDst.au64[0];
13419 puDst->au64[1] = uDst.au64[1];
13420}
13421
13422
13423IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13424{
13425 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13426
13427 uDst.ai32[0] = puSrc1->ai32[ 0] + puSrc1->ai32[ 1];
13428 uDst.ai32[1] = puSrc1->ai32[ 2] + puSrc1->ai32[ 3];
13429 uDst.ai32[2] = puSrc2->ai32[ 0] + puSrc2->ai32[ 1];
13430 uDst.ai32[3] = puSrc2->ai32[ 2] + puSrc2->ai32[ 3];
13431
13432 uDst.ai32[4] = puSrc1->ai32[ 4] + puSrc1->ai32[ 5];
13433 uDst.ai32[5] = puSrc1->ai32[ 6] + puSrc1->ai32[ 7];
13434 uDst.ai32[6] = puSrc2->ai32[ 4] + puSrc2->ai32[ 5];
13435 uDst.ai32[7] = puSrc2->ai32[ 6] + puSrc2->ai32[ 7];
13436
13437 puDst->au64[0] = uDst.au64[0];
13438 puDst->au64[1] = uDst.au64[1];
13439 puDst->au64[2] = uDst.au64[2];
13440 puDst->au64[3] = uDst.au64[3];
13441}
13442
13443
13444/*
13445 * PHSUBW / VPHSUBW / PHSUBD / VPHSUBD
13446 */
13447IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13448{
13449 RTUINT64U uSrc1 = { *puDst };
13450 RTUINT64U uSrc2 = { *puSrc };
13451 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13452
13453 uDst.ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13454 uDst.ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13455 uDst.ai16[2] = uSrc2.ai16[0] - uSrc2.ai16[1];
13456 uDst.ai16[3] = uSrc2.ai16[2] - uSrc2.ai16[3];
13457 *puDst = uDst.u;
13458 RT_NOREF(pFpuState);
13459}
13460
13461
13462IEM_DECL_IMPL_DEF(void, iemAImpl_phsubw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13463{
13464 RTUINT128U uSrc1 = *puDst;
13465
13466 puDst->ai16[0] = uSrc1.ai16[0] - uSrc1.ai16[1];
13467 puDst->ai16[1] = uSrc1.ai16[2] - uSrc1.ai16[3];
13468 puDst->ai16[2] = uSrc1.ai16[4] - uSrc1.ai16[5];
13469 puDst->ai16[3] = uSrc1.ai16[6] - uSrc1.ai16[7];
13470
13471 puDst->ai16[4] = puSrc->ai16[0] - puSrc->ai16[1];
13472 puDst->ai16[5] = puSrc->ai16[2] - puSrc->ai16[3];
13473 puDst->ai16[6] = puSrc->ai16[4] - puSrc->ai16[5];
13474 puDst->ai16[7] = puSrc->ai16[6] - puSrc->ai16[7];
13475 RT_NOREF(pFpuState);
13476}
13477
13478
13479IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13480{
13481 RTUINT64U uSrc1 = { *puDst };
13482 RTUINT64U uSrc2 = { *puSrc };
13483 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13484
13485 uDst.ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13486 uDst.ai32[1] = uSrc2.ai32[0] - uSrc2.ai32[1];
13487 *puDst = uDst.u;
13488 RT_NOREF(pFpuState);
13489}
13490
13491
13492IEM_DECL_IMPL_DEF(void, iemAImpl_phsubd_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13493{
13494 RTUINT128U uSrc1 = *puDst;
13495
13496 puDst->ai32[0] = uSrc1.ai32[0] - uSrc1.ai32[1];
13497 puDst->ai32[1] = uSrc1.ai32[2] - uSrc1.ai32[3];
13498
13499 puDst->ai32[2] = puSrc->ai32[0] - puSrc->ai32[1];
13500 puDst->ai32[3] = puSrc->ai32[2] - puSrc->ai32[3];
13501 RT_NOREF(pFpuState);
13502}
13503
13504
13505IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13506{
13507 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13508
13509 uDst.ai16[0] = puSrc1->ai16[0] - puSrc1->ai16[1];
13510 uDst.ai16[1] = puSrc1->ai16[2] - puSrc1->ai16[3];
13511 uDst.ai16[2] = puSrc1->ai16[4] - puSrc1->ai16[5];
13512 uDst.ai16[3] = puSrc1->ai16[6] - puSrc1->ai16[7];
13513
13514 uDst.ai16[4] = puSrc2->ai16[0] - puSrc2->ai16[1];
13515 uDst.ai16[5] = puSrc2->ai16[2] - puSrc2->ai16[3];
13516 uDst.ai16[6] = puSrc2->ai16[4] - puSrc2->ai16[5];
13517 uDst.ai16[7] = puSrc2->ai16[6] - puSrc2->ai16[7];
13518
13519 puDst->au64[0] = uDst.au64[0];
13520 puDst->au64[1] = uDst.au64[1];
13521}
13522
13523
13524IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13525{
13526 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13527
13528 uDst.ai16[ 0] = puSrc1->ai16[ 0] - puSrc1->ai16[ 1];
13529 uDst.ai16[ 1] = puSrc1->ai16[ 2] - puSrc1->ai16[ 3];
13530 uDst.ai16[ 2] = puSrc1->ai16[ 4] - puSrc1->ai16[ 5];
13531 uDst.ai16[ 3] = puSrc1->ai16[ 6] - puSrc1->ai16[ 7];
13532 uDst.ai16[ 4] = puSrc2->ai16[ 0] - puSrc2->ai16[ 1];
13533 uDst.ai16[ 5] = puSrc2->ai16[ 2] - puSrc2->ai16[ 3];
13534 uDst.ai16[ 6] = puSrc2->ai16[ 4] - puSrc2->ai16[ 5];
13535 uDst.ai16[ 7] = puSrc2->ai16[ 6] - puSrc2->ai16[ 7];
13536
13537 uDst.ai16[ 8] = puSrc1->ai16[ 8] - puSrc1->ai16[ 9];
13538 uDst.ai16[ 9] = puSrc1->ai16[10] - puSrc1->ai16[11];
13539 uDst.ai16[10] = puSrc1->ai16[12] - puSrc1->ai16[13];
13540 uDst.ai16[11] = puSrc1->ai16[14] - puSrc1->ai16[15];
13541 uDst.ai16[12] = puSrc2->ai16[ 8] - puSrc2->ai16[ 9];
13542 uDst.ai16[13] = puSrc2->ai16[10] - puSrc2->ai16[11];
13543 uDst.ai16[14] = puSrc2->ai16[12] - puSrc2->ai16[13];
13544 uDst.ai16[15] = puSrc2->ai16[14] - puSrc2->ai16[15];
13545
13546 puDst->au64[0] = uDst.au64[0];
13547 puDst->au64[1] = uDst.au64[1];
13548 puDst->au64[2] = uDst.au64[2];
13549 puDst->au64[3] = uDst.au64[3];
13550}
13551
13552
13553IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13554{
13555 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13556
13557 uDst.ai32[0] = puSrc1->ai32[0] - puSrc1->ai32[1];
13558 uDst.ai32[1] = puSrc1->ai32[2] - puSrc1->ai32[3];
13559
13560 uDst.ai32[2] = puSrc2->ai32[0] - puSrc2->ai32[1];
13561 uDst.ai32[3] = puSrc2->ai32[2] - puSrc2->ai32[3];
13562
13563 puDst->au64[0] = uDst.au64[0];
13564 puDst->au64[1] = uDst.au64[1];
13565}
13566
13567
13568IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13569{
13570 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13571
13572 uDst.ai32[0] = puSrc1->ai32[ 0] - puSrc1->ai32[ 1];
13573 uDst.ai32[1] = puSrc1->ai32[ 2] - puSrc1->ai32[ 3];
13574 uDst.ai32[2] = puSrc2->ai32[ 0] - puSrc2->ai32[ 1];
13575 uDst.ai32[3] = puSrc2->ai32[ 2] - puSrc2->ai32[ 3];
13576
13577 uDst.ai32[4] = puSrc1->ai32[ 4] - puSrc1->ai32[ 5];
13578 uDst.ai32[5] = puSrc1->ai32[ 6] - puSrc1->ai32[ 7];
13579 uDst.ai32[6] = puSrc2->ai32[ 4] - puSrc2->ai32[ 5];
13580 uDst.ai32[7] = puSrc2->ai32[ 6] - puSrc2->ai32[ 7];
13581
13582 puDst->au64[0] = uDst.au64[0];
13583 puDst->au64[1] = uDst.au64[1];
13584 puDst->au64[2] = uDst.au64[2];
13585 puDst->au64[3] = uDst.au64[3];
13586}
13587
13588
13589/*
13590 * PHADDSW / VPHADDSW
13591 */
13592IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13593{
13594 RTUINT64U uSrc1 = { *puDst };
13595 RTUINT64U uSrc2 = { *puSrc };
13596 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13597
13598 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13599 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13600 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] + uSrc2.ai16[1]);
13601 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] + uSrc2.ai16[3]);
13602 *puDst = uDst.u;
13603 RT_NOREF(pFpuState);
13604}
13605
13606
13607IEM_DECL_IMPL_DEF(void, iemAImpl_phaddsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13608{
13609 RTUINT128U uSrc1 = *puDst;
13610
13611 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] + uSrc1.ai16[1]);
13612 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] + uSrc1.ai16[3]);
13613 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] + uSrc1.ai16[5]);
13614 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] + uSrc1.ai16[7]);
13615
13616 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] + puSrc->ai16[1]);
13617 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] + puSrc->ai16[3]);
13618 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] + puSrc->ai16[5]);
13619 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] + puSrc->ai16[7]);
13620 RT_NOREF(pFpuState);
13621}
13622
13623
13624IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13625{
13626 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13627
13628 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] + puSrc1->ai16[1]);
13629 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] + puSrc1->ai16[3]);
13630 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] + puSrc1->ai16[5]);
13631 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] + puSrc1->ai16[7]);
13632
13633 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] + puSrc2->ai16[1]);
13634 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] + puSrc2->ai16[3]);
13635 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] + puSrc2->ai16[5]);
13636 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] + puSrc2->ai16[7]);
13637
13638 puDst->au64[0] = uDst.au64[0];
13639 puDst->au64[1] = uDst.au64[1];
13640}
13641
13642
13643IEM_DECL_IMPL_DEF(void, iemAImpl_vphaddsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13644{
13645 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13646
13647 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] + puSrc1->ai16[ 1]);
13648 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] + puSrc1->ai16[ 3]);
13649 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] + puSrc1->ai16[ 5]);
13650 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] + puSrc1->ai16[ 7]);
13651 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] + puSrc2->ai16[ 1]);
13652 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] + puSrc2->ai16[ 3]);
13653 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] + puSrc2->ai16[ 5]);
13654 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] + puSrc2->ai16[ 7]);
13655
13656 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] + puSrc1->ai16[ 9]);
13657 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] + puSrc1->ai16[11]);
13658 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] + puSrc1->ai16[13]);
13659 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] + puSrc1->ai16[15]);
13660 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] + puSrc2->ai16[ 9]);
13661 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] + puSrc2->ai16[11]);
13662 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] + puSrc2->ai16[13]);
13663 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] + puSrc2->ai16[15]);
13664
13665 puDst->au64[0] = uDst.au64[0];
13666 puDst->au64[1] = uDst.au64[1];
13667 puDst->au64[2] = uDst.au64[2];
13668 puDst->au64[3] = uDst.au64[3];
13669}
13670
13671
13672/*
13673 * PHSUBSW / VPHSUBSW
13674 */
13675IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13676{
13677 RTUINT64U uSrc1 = { *puDst };
13678 RTUINT64U uSrc2 = { *puSrc };
13679 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13680
13681 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13682 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13683 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[0] - uSrc2.ai16[1]);
13684 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc2.ai16[2] - uSrc2.ai16[3]);
13685 *puDst = uDst.u;
13686 RT_NOREF(pFpuState);
13687}
13688
13689
13690IEM_DECL_IMPL_DEF(void, iemAImpl_phsubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13691{
13692 RTUINT128U uSrc1 = *puDst;
13693
13694 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[0] - uSrc1.ai16[1]);
13695 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[2] - uSrc1.ai16[3]);
13696 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[4] - uSrc1.ai16[5]);
13697 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(uSrc1.ai16[6] - uSrc1.ai16[7]);
13698
13699 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[0] - puSrc->ai16[1]);
13700 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[2] - puSrc->ai16[3]);
13701 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[4] - puSrc->ai16[5]);
13702 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc->ai16[6] - puSrc->ai16[7]);
13703 RT_NOREF(pFpuState);
13704}
13705
13706
13707IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13708{
13709 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13710
13711 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[0] - puSrc1->ai16[1]);
13712 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[2] - puSrc1->ai16[3]);
13713 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[4] - puSrc1->ai16[5]);
13714 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[6] - puSrc1->ai16[7]);
13715
13716 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[0] - puSrc2->ai16[1]);
13717 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[2] - puSrc2->ai16[3]);
13718 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[4] - puSrc2->ai16[5]);
13719 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[6] - puSrc2->ai16[7]);
13720
13721 puDst->au64[0] = uDst.au64[0];
13722 puDst->au64[1] = uDst.au64[1];
13723}
13724
13725
13726IEM_DECL_IMPL_DEF(void, iemAImpl_vphsubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13727{
13728 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13729
13730 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 0] - puSrc1->ai16[ 1]);
13731 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 2] - puSrc1->ai16[ 3]);
13732 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 4] - puSrc1->ai16[ 5]);
13733 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 6] - puSrc1->ai16[ 7]);
13734 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 0] - puSrc2->ai16[ 1]);
13735 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 2] - puSrc2->ai16[ 3]);
13736 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 4] - puSrc2->ai16[ 5]);
13737 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 6] - puSrc2->ai16[ 7]);
13738
13739 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[ 8] - puSrc1->ai16[ 9]);
13740 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[10] - puSrc1->ai16[11]);
13741 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[12] - puSrc1->ai16[13]);
13742 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc1->ai16[14] - puSrc1->ai16[15]);
13743 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[ 8] - puSrc2->ai16[ 9]);
13744 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[10] - puSrc2->ai16[11]);
13745 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[12] - puSrc2->ai16[13]);
13746 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD(puSrc2->ai16[14] - puSrc2->ai16[15]);
13747
13748 puDst->au64[0] = uDst.au64[0];
13749 puDst->au64[1] = uDst.au64[1];
13750 puDst->au64[2] = uDst.au64[2];
13751 puDst->au64[3] = uDst.au64[3];
13752}
13753
13754
13755/*
13756 * PMADDUBSW / VPMADDUBSW
13757 */
13758IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13759{
13760 RTUINT64U uSrc1 = { *puDst };
13761 RTUINT64U uSrc2 = { *puSrc };
13762 RTUINT64U uDst = { 0 }; /* Shut up MSVC. */
13763
13764 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[0] * uSrc2.ai8[0] + (uint16_t)uSrc1.au8[1] * uSrc2.ai8[1]);
13765 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[2] * uSrc2.ai8[2] + (uint16_t)uSrc1.au8[3] * uSrc2.ai8[3]);
13766 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[4] * uSrc2.ai8[4] + (uint16_t)uSrc1.au8[5] * uSrc2.ai8[5]);
13767 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[6] * uSrc2.ai8[6] + (uint16_t)uSrc1.au8[7] * uSrc2.ai8[7]);
13768 *puDst = uDst.u;
13769 RT_NOREF(pFpuState);
13770}
13771
13772
13773IEM_DECL_IMPL_DEF(void, iemAImpl_pmaddubsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13774{
13775 RTUINT128U uSrc1 = *puDst;
13776
13777 puDst->ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 0] * puSrc->ai8[ 0] + (uint16_t)uSrc1.au8[ 1] * puSrc->ai8[ 1]);
13778 puDst->ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 2] * puSrc->ai8[ 2] + (uint16_t)uSrc1.au8[ 3] * puSrc->ai8[ 3]);
13779 puDst->ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 4] * puSrc->ai8[ 4] + (uint16_t)uSrc1.au8[ 5] * puSrc->ai8[ 5]);
13780 puDst->ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 6] * puSrc->ai8[ 6] + (uint16_t)uSrc1.au8[ 7] * puSrc->ai8[ 7]);
13781 puDst->ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[ 8] * puSrc->ai8[ 8] + (uint16_t)uSrc1.au8[ 9] * puSrc->ai8[ 9]);
13782 puDst->ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[10] * puSrc->ai8[10] + (uint16_t)uSrc1.au8[11] * puSrc->ai8[11]);
13783 puDst->ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[12] * puSrc->ai8[12] + (uint16_t)uSrc1.au8[13] * puSrc->ai8[13]);
13784 puDst->ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)uSrc1.au8[14] * puSrc->ai8[14] + (uint16_t)uSrc1.au8[15] * puSrc->ai8[15]);
13785 RT_NOREF(pFpuState);
13786}
13787
13788
13789IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13790{
13791 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13792
13793 uDst.ai16[0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13794 uDst.ai16[1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13795 uDst.ai16[2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13796 uDst.ai16[3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13797 uDst.ai16[4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13798 uDst.ai16[5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13799 uDst.ai16[6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13800 uDst.ai16[7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13801
13802 puDst->au64[0] = uDst.au64[0];
13803 puDst->au64[1] = uDst.au64[1];
13804}
13805
13806
13807IEM_DECL_IMPL_DEF(void, iemAImpl_vpmaddubsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13808{
13809 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13810
13811 uDst.ai16[ 0] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 0] * puSrc2->ai8[ 0] + (uint16_t)puSrc1->au8[ 1] * puSrc2->ai8[ 1]);
13812 uDst.ai16[ 1] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 2] * puSrc2->ai8[ 2] + (uint16_t)puSrc1->au8[ 3] * puSrc2->ai8[ 3]);
13813 uDst.ai16[ 2] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 4] * puSrc2->ai8[ 4] + (uint16_t)puSrc1->au8[ 5] * puSrc2->ai8[ 5]);
13814 uDst.ai16[ 3] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 6] * puSrc2->ai8[ 6] + (uint16_t)puSrc1->au8[ 7] * puSrc2->ai8[ 7]);
13815 uDst.ai16[ 4] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[ 8] * puSrc2->ai8[ 8] + (uint16_t)puSrc1->au8[ 9] * puSrc2->ai8[ 9]);
13816 uDst.ai16[ 5] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[10] * puSrc2->ai8[10] + (uint16_t)puSrc1->au8[11] * puSrc2->ai8[11]);
13817 uDst.ai16[ 6] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[12] * puSrc2->ai8[12] + (uint16_t)puSrc1->au8[13] * puSrc2->ai8[13]);
13818 uDst.ai16[ 7] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[14] * puSrc2->ai8[14] + (uint16_t)puSrc1->au8[15] * puSrc2->ai8[15]);
13819 uDst.ai16[ 8] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[16] * puSrc2->ai8[16] + (uint16_t)puSrc1->au8[17] * puSrc2->ai8[17]);
13820 uDst.ai16[ 9] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[18] * puSrc2->ai8[18] + (uint16_t)puSrc1->au8[19] * puSrc2->ai8[19]);
13821 uDst.ai16[10] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[20] * puSrc2->ai8[20] + (uint16_t)puSrc1->au8[21] * puSrc2->ai8[21]);
13822 uDst.ai16[11] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[22] * puSrc2->ai8[22] + (uint16_t)puSrc1->au8[23] * puSrc2->ai8[23]);
13823 uDst.ai16[12] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[24] * puSrc2->ai8[24] + (uint16_t)puSrc1->au8[25] * puSrc2->ai8[25]);
13824 uDst.ai16[13] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[26] * puSrc2->ai8[26] + (uint16_t)puSrc1->au8[27] * puSrc2->ai8[27]);
13825 uDst.ai16[14] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[28] * puSrc2->ai8[28] + (uint16_t)puSrc1->au8[29] * puSrc2->ai8[29]);
13826 uDst.ai16[15] = SATURATED_SIGNED_DWORD_TO_SIGNED_WORD((uint16_t)puSrc1->au8[30] * puSrc2->ai8[30] + (uint16_t)puSrc1->au8[31] * puSrc2->ai8[31]);
13827
13828 puDst->au64[0] = uDst.au64[0];
13829 puDst->au64[1] = uDst.au64[1];
13830 puDst->au64[2] = uDst.au64[2];
13831 puDst->au64[3] = uDst.au64[3];
13832}
13833
13834
13835/*
13836 * PMULHRSW / VPMULHRSW
13837 */
/** Multiplies two signed words as 32-bit, drops the low 14 bits of the
 *  product, adds one for rounding, drops one more bit and truncates the
 *  result to 16 bits. */
#define DO_PMULHRSW(a_Src1, a_Src2) \
    ((uint16_t)((1 + (((int32_t)(a_Src1) * (a_Src2)) >> 14)) >> 1))
13840
13841IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u64_fallback,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
13842{
13843 RTUINT64U uSrc1 = { *puDst };
13844 RTUINT64U uSrc2 = { *puSrc };
13845 RTUINT64U uDst;
13846
13847 uDst.au16[0] = DO_PMULHRSW(uSrc1.ai16[0], uSrc2.ai16[0]);
13848 uDst.au16[1] = DO_PMULHRSW(uSrc1.ai16[1], uSrc2.ai16[1]);
13849 uDst.au16[2] = DO_PMULHRSW(uSrc1.ai16[2], uSrc2.ai16[2]);
13850 uDst.au16[3] = DO_PMULHRSW(uSrc1.ai16[3], uSrc2.ai16[3]);
13851 *puDst = uDst.u;
13852 RT_NOREF(pFpuState);
13853}
13854
13855
13856IEM_DECL_IMPL_DEF(void, iemAImpl_pmulhrsw_u128_fallback,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
13857{
13858 RTUINT128U uSrc1 = *puDst;
13859
13860 puDst->ai16[0] = DO_PMULHRSW(uSrc1.ai16[0], puSrc->ai16[0]);
13861 puDst->ai16[1] = DO_PMULHRSW(uSrc1.ai16[1], puSrc->ai16[1]);
13862 puDst->ai16[2] = DO_PMULHRSW(uSrc1.ai16[2], puSrc->ai16[2]);
13863 puDst->ai16[3] = DO_PMULHRSW(uSrc1.ai16[3], puSrc->ai16[3]);
13864 puDst->ai16[4] = DO_PMULHRSW(uSrc1.ai16[4], puSrc->ai16[4]);
13865 puDst->ai16[5] = DO_PMULHRSW(uSrc1.ai16[5], puSrc->ai16[5]);
13866 puDst->ai16[6] = DO_PMULHRSW(uSrc1.ai16[6], puSrc->ai16[6]);
13867 puDst->ai16[7] = DO_PMULHRSW(uSrc1.ai16[7], puSrc->ai16[7]);
13868 RT_NOREF(pFpuState);
13869}
13870
13871
13872IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13873{
13874 RTUINT128U uDst; /* puDst can be the same as one of the source operands. */
13875
13876 uDst.ai16[0] = DO_PMULHRSW(puSrc1->ai16[0], puSrc2->ai16[0]);
13877 uDst.ai16[1] = DO_PMULHRSW(puSrc1->ai16[1], puSrc2->ai16[1]);
13878 uDst.ai16[2] = DO_PMULHRSW(puSrc1->ai16[2], puSrc2->ai16[2]);
13879 uDst.ai16[3] = DO_PMULHRSW(puSrc1->ai16[3], puSrc2->ai16[3]);
13880 uDst.ai16[4] = DO_PMULHRSW(puSrc1->ai16[4], puSrc2->ai16[4]);
13881 uDst.ai16[5] = DO_PMULHRSW(puSrc1->ai16[5], puSrc2->ai16[5]);
13882 uDst.ai16[6] = DO_PMULHRSW(puSrc1->ai16[6], puSrc2->ai16[6]);
13883 uDst.ai16[7] = DO_PMULHRSW(puSrc1->ai16[7], puSrc2->ai16[7]);
13884
13885 puDst->au64[0] = uDst.au64[0];
13886 puDst->au64[1] = uDst.au64[1];
13887}
13888
13889
13890IEM_DECL_IMPL_DEF(void, iemAImpl_vpmulhrsw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
13891{
13892 RTUINT256U uDst; /* puDst can be the same as one of the source operands. */
13893
13894 uDst.ai16[ 0] = DO_PMULHRSW(puSrc1->ai16[ 0], puSrc2->ai16[ 0]);
13895 uDst.ai16[ 1] = DO_PMULHRSW(puSrc1->ai16[ 1], puSrc2->ai16[ 1]);
13896 uDst.ai16[ 2] = DO_PMULHRSW(puSrc1->ai16[ 2], puSrc2->ai16[ 2]);
13897 uDst.ai16[ 3] = DO_PMULHRSW(puSrc1->ai16[ 3], puSrc2->ai16[ 3]);
13898 uDst.ai16[ 4] = DO_PMULHRSW(puSrc1->ai16[ 4], puSrc2->ai16[ 4]);
13899 uDst.ai16[ 5] = DO_PMULHRSW(puSrc1->ai16[ 5], puSrc2->ai16[ 5]);
13900 uDst.ai16[ 6] = DO_PMULHRSW(puSrc1->ai16[ 6], puSrc2->ai16[ 6]);
13901 uDst.ai16[ 7] = DO_PMULHRSW(puSrc1->ai16[ 7], puSrc2->ai16[ 7]);
13902 uDst.ai16[ 8] = DO_PMULHRSW(puSrc1->ai16[ 8], puSrc2->ai16[ 8]);
13903 uDst.ai16[ 9] = DO_PMULHRSW(puSrc1->ai16[ 9], puSrc2->ai16[ 9]);
13904 uDst.ai16[10] = DO_PMULHRSW(puSrc1->ai16[10], puSrc2->ai16[10]);
13905 uDst.ai16[11] = DO_PMULHRSW(puSrc1->ai16[11], puSrc2->ai16[11]);
13906 uDst.ai16[12] = DO_PMULHRSW(puSrc1->ai16[12], puSrc2->ai16[12]);
13907 uDst.ai16[13] = DO_PMULHRSW(puSrc1->ai16[13], puSrc2->ai16[13]);
13908 uDst.ai16[14] = DO_PMULHRSW(puSrc1->ai16[14], puSrc2->ai16[14]);
13909 uDst.ai16[15] = DO_PMULHRSW(puSrc1->ai16[15], puSrc2->ai16[15]);
13910
13911 puDst->au64[0] = uDst.au64[0];
13912 puDst->au64[1] = uDst.au64[1];
13913 puDst->au64[2] = uDst.au64[2];
13914 puDst->au64[3] = uDst.au64[3];
13915}
13916
13917
13918/*
13919 * PSADBW / VPSADBW
13920 */
13921#ifdef IEM_WITHOUT_ASSEMBLY
13922
13923IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u64,(uint64_t *puDst, uint64_t const *puSrc))
13924{
13925 RTUINT64U uSrc1 = { *puDst };
13926 RTUINT64U uSrc2 = { *puSrc };
13927 RTUINT64U uDst;
13928 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
13929 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13930 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13931 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13932 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13933 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13934 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13935 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13936
13937 uDst.au64[0] = 0;
13938 uDst.au16[0] = uSum;
13939 *puDst = uDst.u;
13940}
13941
13942
13943IEM_DECL_IMPL_DEF(void, iemAImpl_psadbw_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
13944{
13945 RTUINT128U uSrc1 = *puDst;
13946
13947 puDst->au64[0] = 0;
13948 puDst->au64[1] = 0;
13949
13950 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - puSrc->ai8[0]);
13951 uSum += RT_ABS((int16_t)uSrc1.au8[1] - puSrc->au8[1]);
13952 uSum += RT_ABS((int16_t)uSrc1.au8[2] - puSrc->au8[2]);
13953 uSum += RT_ABS((int16_t)uSrc1.au8[3] - puSrc->au8[3]);
13954 uSum += RT_ABS((int16_t)uSrc1.au8[4] - puSrc->au8[4]);
13955 uSum += RT_ABS((int16_t)uSrc1.au8[5] - puSrc->au8[5]);
13956 uSum += RT_ABS((int16_t)uSrc1.au8[6] - puSrc->au8[6]);
13957 uSum += RT_ABS((int16_t)uSrc1.au8[7] - puSrc->au8[7]);
13958 puDst->au16[0] = uSum;
13959
13960 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - puSrc->au8[ 8]);
13961 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - puSrc->au8[ 9]);
13962 uSum += RT_ABS((int16_t)uSrc1.au8[10] - puSrc->au8[10]);
13963 uSum += RT_ABS((int16_t)uSrc1.au8[11] - puSrc->au8[11]);
13964 uSum += RT_ABS((int16_t)uSrc1.au8[12] - puSrc->au8[12]);
13965 uSum += RT_ABS((int16_t)uSrc1.au8[13] - puSrc->au8[13]);
13966 uSum += RT_ABS((int16_t)uSrc1.au8[14] - puSrc->au8[14]);
13967 uSum += RT_ABS((int16_t)uSrc1.au8[15] - puSrc->au8[15]);
13968 puDst->au16[4] = uSum;
13969}
13970
13971#endif
13972
13973IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
13974{
13975 RTUINT128U uSrc1 = *puSrc1;
13976 RTUINT128U uSrc2 = *puSrc2;
13977
13978 puDst->au64[0] = 0;
13979 puDst->au64[1] = 0;
13980
13981 uint16_t uSum = RT_ABS((int16_t)uSrc1.ai8[0] - uSrc2.ai8[0]);
13982 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
13983 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
13984 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
13985 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
13986 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
13987 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
13988 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
13989 puDst->au16[0] = uSum;
13990
13991 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
13992 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
13993 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
13994 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
13995 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
13996 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
13997 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
13998 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
13999 puDst->au16[4] = uSum;
14000}
14001
14002IEM_DECL_IMPL_DEF(void, iemAImpl_vpsadbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14003{
14004 RTUINT256U uSrc1 = *puSrc1;
14005 RTUINT256U uSrc2 = *puSrc2;
14006
14007 puDst->au64[0] = 0;
14008 puDst->au64[1] = 0;
14009 puDst->au64[2] = 0;
14010 puDst->au64[3] = 0;
14011
14012 uint16_t uSum = RT_ABS((int16_t)uSrc1.au8[0] - uSrc2.au8[0]);
14013 uSum += RT_ABS((int16_t)uSrc1.au8[1] - uSrc2.au8[1]);
14014 uSum += RT_ABS((int16_t)uSrc1.au8[2] - uSrc2.au8[2]);
14015 uSum += RT_ABS((int16_t)uSrc1.au8[3] - uSrc2.au8[3]);
14016 uSum += RT_ABS((int16_t)uSrc1.au8[4] - uSrc2.au8[4]);
14017 uSum += RT_ABS((int16_t)uSrc1.au8[5] - uSrc2.au8[5]);
14018 uSum += RT_ABS((int16_t)uSrc1.au8[6] - uSrc2.au8[6]);
14019 uSum += RT_ABS((int16_t)uSrc1.au8[7] - uSrc2.au8[7]);
14020 puDst->au16[0] = uSum;
14021
14022 uSum = RT_ABS((int16_t)uSrc1.au8[ 8] - uSrc2.au8[ 8]);
14023 uSum += RT_ABS((int16_t)uSrc1.au8[ 9] - uSrc2.au8[ 9]);
14024 uSum += RT_ABS((int16_t)uSrc1.au8[10] - uSrc2.au8[10]);
14025 uSum += RT_ABS((int16_t)uSrc1.au8[11] - uSrc2.au8[11]);
14026 uSum += RT_ABS((int16_t)uSrc1.au8[12] - uSrc2.au8[12]);
14027 uSum += RT_ABS((int16_t)uSrc1.au8[13] - uSrc2.au8[13]);
14028 uSum += RT_ABS((int16_t)uSrc1.au8[14] - uSrc2.au8[14]);
14029 uSum += RT_ABS((int16_t)uSrc1.au8[15] - uSrc2.au8[15]);
14030 puDst->au16[4] = uSum;
14031
14032 uSum = RT_ABS((int16_t)uSrc1.au8[16] - uSrc2.au8[16]);
14033 uSum += RT_ABS((int16_t)uSrc1.au8[17] - uSrc2.au8[17]);
14034 uSum += RT_ABS((int16_t)uSrc1.au8[18] - uSrc2.au8[18]);
14035 uSum += RT_ABS((int16_t)uSrc1.au8[19] - uSrc2.au8[19]);
14036 uSum += RT_ABS((int16_t)uSrc1.au8[20] - uSrc2.au8[20]);
14037 uSum += RT_ABS((int16_t)uSrc1.au8[21] - uSrc2.au8[21]);
14038 uSum += RT_ABS((int16_t)uSrc1.au8[22] - uSrc2.au8[22]);
14039 uSum += RT_ABS((int16_t)uSrc1.au8[23] - uSrc2.au8[23]);
14040 puDst->au16[8] = uSum;
14041
14042 uSum = RT_ABS((int16_t)uSrc1.au8[24] - uSrc2.au8[24]);
14043 uSum += RT_ABS((int16_t)uSrc1.au8[25] - uSrc2.au8[25]);
14044 uSum += RT_ABS((int16_t)uSrc1.au8[26] - uSrc2.au8[26]);
14045 uSum += RT_ABS((int16_t)uSrc1.au8[27] - uSrc2.au8[27]);
14046 uSum += RT_ABS((int16_t)uSrc1.au8[28] - uSrc2.au8[28]);
14047 uSum += RT_ABS((int16_t)uSrc1.au8[29] - uSrc2.au8[29]);
14048 uSum += RT_ABS((int16_t)uSrc1.au8[30] - uSrc2.au8[30]);
14049 uSum += RT_ABS((int16_t)uSrc1.au8[31] - uSrc2.au8[31]);
14050 puDst->au16[12] = uSum;
14051}
14052
14053
14054/*
14055 * PMULDQ / VPMULDQ
14056 */
14057IEM_DECL_IMPL_DEF(void, iemAImpl_pmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14058{
14059 RTUINT128U uSrc1 = *puDst;
14060
14061 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * puSrc->ai32[0];
14062 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * puSrc->ai32[2];
14063}
14064
14065IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14066{
14067 RTUINT128U uSrc1 = *puSrc1;
14068 RTUINT128U uSrc2 = *puSrc2;
14069
14070 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14071 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14072}
14073
14074IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuldq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14075{
14076 RTUINT256U uSrc1 = *puSrc1;
14077 RTUINT256U uSrc2 = *puSrc2;
14078
14079 puDst->au64[0] = (int64_t)uSrc1.ai32[0] * uSrc2.ai32[0];
14080 puDst->au64[1] = (int64_t)uSrc1.ai32[2] * uSrc2.ai32[2];
14081 puDst->au64[2] = (int64_t)uSrc1.ai32[4] * uSrc2.ai32[4];
14082 puDst->au64[3] = (int64_t)uSrc1.ai32[6] * uSrc2.ai32[6];
14083}
14084
14085
14086/*
14087 * PMULUDQ / VPMULUDQ
14088 */
14089#ifdef IEM_WITHOUT_ASSEMBLY
14090
14091IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u64,(PCX86FXSTATE pFpuState, uint64_t *puDst, uint64_t const *puSrc))
14092{
14093 RTUINT64U uSrc1 = { *puDst };
14094 RTUINT64U uSrc2 = { *puSrc };
14095 ASMCompilerBarrier();
14096 *puDst = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14097 RT_NOREF(pFpuState);
14098}
14099
14100
14101IEM_DECL_IMPL_DEF(void, iemAImpl_pmuludq_u128,(PCX86FXSTATE pFpuState, PRTUINT128U puDst, PCRTUINT128U puSrc))
14102{
14103 RTUINT128U uSrc1 = *puDst;
14104 RTUINT128U uSrc2 = *puSrc;
14105 ASMCompilerBarrier();
14106 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14107 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14108 RT_NOREF(pFpuState);
14109}
14110
14111#endif
14112
14113IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14114{
14115 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14116 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14117 ASMCompilerBarrier();
14118 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14119 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14120}
14121
14122
14123IEM_DECL_IMPL_DEF(void, iemAImpl_vpmuludq_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14124{
14125 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14126 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14127 ASMCompilerBarrier();
14128 puDst->au64[0] = (uint64_t)uSrc1.au32[0] * uSrc2.au32[0];
14129 puDst->au64[1] = (uint64_t)uSrc1.au32[2] * uSrc2.au32[2];
14130 puDst->au64[2] = (uint64_t)uSrc1.au32[4] * uSrc2.au32[4];
14131 puDst->au64[3] = (uint64_t)uSrc1.au32[6] * uSrc2.au32[6];
14132}
14133
14134
14135/*
14136 * UNPCKLPS / VUNPCKLPS
14137 */
14138#ifdef IEM_WITHOUT_ASSEMBLY
14139IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14140{
14141 RTUINT128U uSrc1 = *puDst;
14142 RTUINT128U uSrc2 = *puSrc;
14143 ASMCompilerBarrier();
14144 puDst->au32[0] = uSrc1.au32[0];
14145 puDst->au32[1] = uSrc2.au32[0];
14146 puDst->au32[2] = uSrc1.au32[1];
14147 puDst->au32[3] = uSrc2.au32[1];
14148}
14149
14150#endif
14151
14152IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14153{
14154 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14155 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14156 ASMCompilerBarrier();
14157 puDst->au32[0] = uSrc1.au32[0];
14158 puDst->au32[1] = uSrc2.au32[0];
14159 puDst->au32[2] = uSrc1.au32[1];
14160 puDst->au32[3] = uSrc2.au32[1];
14161}
14162
14163
14164IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14165{
14166 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14167 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14168 ASMCompilerBarrier();
14169 puDst->au32[0] = uSrc1.au32[0];
14170 puDst->au32[1] = uSrc2.au32[0];
14171 puDst->au32[2] = uSrc1.au32[1];
14172 puDst->au32[3] = uSrc2.au32[1];
14173
14174 puDst->au32[4] = uSrc1.au32[4];
14175 puDst->au32[5] = uSrc2.au32[4];
14176 puDst->au32[6] = uSrc1.au32[5];
14177 puDst->au32[7] = uSrc2.au32[5];
14178}
14179
14180
14181/*
14182 * UNPCKLPD / VUNPCKLPD
14183 */
14184#ifdef IEM_WITHOUT_ASSEMBLY
14185IEM_DECL_IMPL_DEF(void, iemAImpl_unpcklpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14186{
14187 RTUINT128U uSrc1 = *puDst;
14188 RTUINT128U uSrc2 = *puSrc;
14189 ASMCompilerBarrier();
14190 puDst->au64[0] = uSrc1.au64[0];
14191 puDst->au64[1] = uSrc2.au64[0];
14192}
14193
14194#endif
14195
14196IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14197{
14198 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14199 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14200 ASMCompilerBarrier();
14201 puDst->au64[0] = uSrc1.au64[0];
14202 puDst->au64[1] = uSrc2.au64[0];
14203}
14204
14205
14206IEM_DECL_IMPL_DEF(void, iemAImpl_vunpcklpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14207{
14208 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14209 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14210 ASMCompilerBarrier();
14211 puDst->au64[0] = uSrc1.au64[0];
14212 puDst->au64[1] = uSrc2.au64[0];
14213 puDst->au64[2] = uSrc1.au64[2];
14214 puDst->au64[3] = uSrc2.au64[2];
14215}
14216
14217
14218/*
14219 * UNPCKHPS / VUNPCKHPS
14220 */
14221#ifdef IEM_WITHOUT_ASSEMBLY
14222IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14223{
14224 RTUINT128U uSrc1 = *puDst;
14225 RTUINT128U uSrc2 = *puSrc;
14226 ASMCompilerBarrier();
14227 puDst->au32[0] = uSrc1.au32[2];
14228 puDst->au32[1] = uSrc2.au32[2];
14229 puDst->au32[2] = uSrc1.au32[3];
14230 puDst->au32[3] = uSrc2.au32[3];
14231}
14232
14233#endif
14234
14235IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14236{
14237 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14238 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14239 ASMCompilerBarrier();
14240 puDst->au32[0] = uSrc1.au32[2];
14241 puDst->au32[1] = uSrc2.au32[2];
14242 puDst->au32[2] = uSrc1.au32[3];
14243 puDst->au32[3] = uSrc2.au32[3];
14244}
14245
14246
14247IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14248{
14249 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14250 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14251 ASMCompilerBarrier();
14252 puDst->au32[0] = uSrc1.au32[2];
14253 puDst->au32[1] = uSrc2.au32[2];
14254 puDst->au32[2] = uSrc1.au32[3];
14255 puDst->au32[3] = uSrc2.au32[3];
14256
14257 puDst->au32[4] = uSrc1.au32[6];
14258 puDst->au32[5] = uSrc2.au32[6];
14259 puDst->au32[6] = uSrc1.au32[7];
14260 puDst->au32[7] = uSrc2.au32[7];
14261}
14262
14263
14264/*
14265 * UNPCKHPD / VUNPCKHPD
14266 */
14267#ifdef IEM_WITHOUT_ASSEMBLY
14268IEM_DECL_IMPL_DEF(void, iemAImpl_unpckhpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc))
14269{
14270 RTUINT128U uSrc1 = *puDst;
14271 RTUINT128U uSrc2 = *puSrc;
14272 ASMCompilerBarrier();
14273 puDst->au64[0] = uSrc1.au64[1];
14274 puDst->au64[1] = uSrc2.au64[1];
14275}
14276
14277#endif
14278
14279IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2))
14280{
14281 RTUINT128U uSrc1 = *puSrc1; /* Could overlap with puDst */
14282 RTUINT128U uSrc2 = *puSrc2; /* Could overlap with puDst */
14283 ASMCompilerBarrier();
14284 puDst->au64[0] = uSrc1.au64[1];
14285 puDst->au64[1] = uSrc2.au64[1];
14286}
14287
14288
14289IEM_DECL_IMPL_DEF(void, iemAImpl_vunpckhpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2))
14290{
14291 RTUINT256U uSrc1 = *puSrc1; /* Could overlap with puDst */
14292 RTUINT256U uSrc2 = *puSrc2; /* Could overlap with puDst */
14293 ASMCompilerBarrier();
14294 puDst->au64[0] = uSrc1.au64[1];
14295 puDst->au64[1] = uSrc2.au64[1];
14296 puDst->au64[2] = uSrc1.au64[3];
14297 puDst->au64[3] = uSrc2.au64[3];
14298}
14299
14300
14301/*
 * CRC32 (SSE 4.2).
14303 */
14304
14305IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u8_fallback,(uint32_t *puDst, uint8_t uSrc))
14306{
14307 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14308}
14309
14310
14311IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u16_fallback,(uint32_t *puDst, uint16_t uSrc))
14312{
14313 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14314}
14315
14316IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u32_fallback,(uint32_t *puDst, uint32_t uSrc))
14317{
14318 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14319}
14320
14321IEM_DECL_IMPL_DEF(void, iemAImpl_crc32_u64_fallback,(uint32_t *puDst, uint64_t uSrc))
14322{
14323 *puDst = RTCrc32CProcess(*puDst, &uSrc, sizeof(uSrc));
14324}
14325
14326
14327/*
 * PTEST (SSE 4.1) - special as it outputs only EFLAGS.
14329 */
14330#ifdef IEM_WITHOUT_ASSEMBLY
14331IEM_DECL_IMPL_DEF(void, iemAImpl_ptest_u128,(PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint32_t *pfEFlags))
14332{
14333 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14334 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14335 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14336 fEfl |= X86_EFL_ZF;
14337 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14338 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0)
14339 fEfl |= X86_EFL_CF;
14340 *pfEFlags = fEfl;
14341}
14342#endif
14343
14344IEM_DECL_IMPL_DEF(void, iemAImpl_vptest_u256_fallback,(PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint32_t *pfEFlags))
14345{
14346 uint32_t fEfl = *pfEFlags & ~X86_EFL_STATUS_BITS;
14347 if ( (puSrc1->au64[0] & puSrc2->au64[0]) == 0
14348 && (puSrc1->au64[1] & puSrc2->au64[1]) == 0
14349 && (puSrc1->au64[2] & puSrc2->au64[2]) == 0
14350 && (puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14351 fEfl |= X86_EFL_ZF;
14352 if ( (~puSrc1->au64[0] & puSrc2->au64[0]) == 0
14353 && (~puSrc1->au64[1] & puSrc2->au64[1]) == 0
14354 && (~puSrc1->au64[2] & puSrc2->au64[2]) == 0
14355 && (~puSrc1->au64[3] & puSrc2->au64[3]) == 0)
14356 fEfl |= X86_EFL_CF;
14357 *pfEFlags = fEfl;
14358}
14359
14360
14361/*
14362 * PMOVSXBW / VPMOVSXBW
14363 */
14364IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14365{
14366 RTUINT64U uSrc1 = { uSrc };
14367 puDst->ai16[0] = uSrc1.ai8[0];
14368 puDst->ai16[1] = uSrc1.ai8[1];
14369 puDst->ai16[2] = uSrc1.ai8[2];
14370 puDst->ai16[3] = uSrc1.ai8[3];
14371 puDst->ai16[4] = uSrc1.ai8[4];
14372 puDst->ai16[5] = uSrc1.ai8[5];
14373 puDst->ai16[6] = uSrc1.ai8[6];
14374 puDst->ai16[7] = uSrc1.ai8[7];
14375}
14376
14377
14378IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14379{
14380 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14381 puDst->ai16[ 0] = uSrc1.ai8[ 0];
14382 puDst->ai16[ 1] = uSrc1.ai8[ 1];
14383 puDst->ai16[ 2] = uSrc1.ai8[ 2];
14384 puDst->ai16[ 3] = uSrc1.ai8[ 3];
14385 puDst->ai16[ 4] = uSrc1.ai8[ 4];
14386 puDst->ai16[ 5] = uSrc1.ai8[ 5];
14387 puDst->ai16[ 6] = uSrc1.ai8[ 6];
14388 puDst->ai16[ 7] = uSrc1.ai8[ 7];
14389 puDst->ai16[ 8] = uSrc1.ai8[ 8];
14390 puDst->ai16[ 9] = uSrc1.ai8[ 9];
14391 puDst->ai16[10] = uSrc1.ai8[10];
14392 puDst->ai16[11] = uSrc1.ai8[11];
14393 puDst->ai16[12] = uSrc1.ai8[12];
14394 puDst->ai16[13] = uSrc1.ai8[13];
14395 puDst->ai16[14] = uSrc1.ai8[14];
14396 puDst->ai16[15] = uSrc1.ai8[15];
14397}
14398
14399
14400/*
14401 * PMOVSXBD / VPMOVSXBD
14402 */
14403IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14404{
14405 RTUINT32U uSrc1 = { uSrc };
14406 puDst->ai32[0] = uSrc1.ai8[0];
14407 puDst->ai32[1] = uSrc1.ai8[1];
14408 puDst->ai32[2] = uSrc1.ai8[2];
14409 puDst->ai32[3] = uSrc1.ai8[3];
14410}
14411
14412
14413IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14414{
14415 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14416 puDst->ai32[0] = uSrc1.ai8[0];
14417 puDst->ai32[1] = uSrc1.ai8[1];
14418 puDst->ai32[2] = uSrc1.ai8[2];
14419 puDst->ai32[3] = uSrc1.ai8[3];
14420 puDst->ai32[4] = uSrc1.ai8[4];
14421 puDst->ai32[5] = uSrc1.ai8[5];
14422 puDst->ai32[6] = uSrc1.ai8[6];
14423 puDst->ai32[7] = uSrc1.ai8[7];
14424}
14425
14426
14427/*
14428 * PMOVSXBQ / VPMOVSXBQ
14429 */
14430IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14431{
14432 RTUINT16U uSrc1 = { uSrc };
14433 puDst->ai64[0] = uSrc1.ai8[0];
14434 puDst->ai64[1] = uSrc1.ai8[1];
14435}
14436
14437
14438IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14439{
14440 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14441 puDst->ai64[0] = uSrc1.ai8[0];
14442 puDst->ai64[1] = uSrc1.ai8[1];
14443 puDst->ai64[2] = uSrc1.ai8[2];
14444 puDst->ai64[3] = uSrc1.ai8[3];
14445}
14446
14447
14448/*
14449 * PMOVSXWD / VPMOVSXWD
14450 */
14451IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14452{
14453 RTUINT64U uSrc1 = { uSrc };
14454 puDst->ai32[0] = uSrc1.ai16[0];
14455 puDst->ai32[1] = uSrc1.ai16[1];
14456 puDst->ai32[2] = uSrc1.ai16[2];
14457 puDst->ai32[3] = uSrc1.ai16[3];
14458}
14459
14460
14461IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14462{
14463 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14464 puDst->ai32[0] = uSrc1.ai16[0];
14465 puDst->ai32[1] = uSrc1.ai16[1];
14466 puDst->ai32[2] = uSrc1.ai16[2];
14467 puDst->ai32[3] = uSrc1.ai16[3];
14468 puDst->ai32[4] = uSrc1.ai16[4];
14469 puDst->ai32[5] = uSrc1.ai16[5];
14470 puDst->ai32[6] = uSrc1.ai16[6];
14471 puDst->ai32[7] = uSrc1.ai16[7];
14472}
14473
14474
14475/*
14476 * PMOVSXWQ / VPMOVSXWQ
14477 */
14478IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14479{
14480 RTUINT32U uSrc1 = { uSrc };
14481 puDst->ai64[0] = uSrc1.ai16[0];
14482 puDst->ai64[1] = uSrc1.ai16[1];
14483}
14484
14485
14486IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14487{
14488 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14489 puDst->ai64[0] = uSrc1.ai16[0];
14490 puDst->ai64[1] = uSrc1.ai16[1];
14491 puDst->ai64[2] = uSrc1.ai16[2];
14492 puDst->ai64[3] = uSrc1.ai16[3];
14493}
14494
14495
14496/*
14497 * PMOVSXDQ / VPMOVSXDQ
14498 */
14499IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14500{
14501 RTUINT64U uSrc1 = { uSrc };
14502 puDst->ai64[0] = uSrc1.ai32[0];
14503 puDst->ai64[1] = uSrc1.ai32[1];
14504}
14505
14506
14507IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovsxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14508{
14509 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14510 puDst->ai64[0] = uSrc1.ai32[0];
14511 puDst->ai64[1] = uSrc1.ai32[1];
14512 puDst->ai64[2] = uSrc1.ai32[2];
14513 puDst->ai64[3] = uSrc1.ai32[3];
14514}
14515
14516
14517/*
14518 * PMOVZXBW / VPMOVZXBW
14519 */
14520IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14521{
14522 RTUINT64U uSrc1 = { uSrc };
14523 puDst->au16[0] = uSrc1.au8[0];
14524 puDst->au16[1] = uSrc1.au8[1];
14525 puDst->au16[2] = uSrc1.au8[2];
14526 puDst->au16[3] = uSrc1.au8[3];
14527 puDst->au16[4] = uSrc1.au8[4];
14528 puDst->au16[5] = uSrc1.au8[5];
14529 puDst->au16[6] = uSrc1.au8[6];
14530 puDst->au16[7] = uSrc1.au8[7];
14531}
14532
14533
14534IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbw_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14535{
14536 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14537 puDst->au16[ 0] = uSrc1.au8[ 0];
14538 puDst->au16[ 1] = uSrc1.au8[ 1];
14539 puDst->au16[ 2] = uSrc1.au8[ 2];
14540 puDst->au16[ 3] = uSrc1.au8[ 3];
14541 puDst->au16[ 4] = uSrc1.au8[ 4];
14542 puDst->au16[ 5] = uSrc1.au8[ 5];
14543 puDst->au16[ 6] = uSrc1.au8[ 6];
14544 puDst->au16[ 7] = uSrc1.au8[ 7];
14545 puDst->au16[ 8] = uSrc1.au8[ 8];
14546 puDst->au16[ 9] = uSrc1.au8[ 9];
14547 puDst->au16[10] = uSrc1.au8[10];
14548 puDst->au16[11] = uSrc1.au8[11];
14549 puDst->au16[12] = uSrc1.au8[12];
14550 puDst->au16[13] = uSrc1.au8[13];
14551 puDst->au16[14] = uSrc1.au8[14];
14552 puDst->au16[15] = uSrc1.au8[15];
14553}
14554
14555
14556/*
14557 * PMOVZXBD / VPMOVZXBD
14558 */
14559IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14560{
14561 RTUINT32U uSrc1 = { uSrc };
14562 puDst->au32[0] = uSrc1.au8[0];
14563 puDst->au32[1] = uSrc1.au8[1];
14564 puDst->au32[2] = uSrc1.au8[2];
14565 puDst->au32[3] = uSrc1.au8[3];
14566}
14567
14568
14569IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14570{
14571 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14572 puDst->au32[0] = uSrc1.au8[0];
14573 puDst->au32[1] = uSrc1.au8[1];
14574 puDst->au32[2] = uSrc1.au8[2];
14575 puDst->au32[3] = uSrc1.au8[3];
14576 puDst->au32[4] = uSrc1.au8[4];
14577 puDst->au32[5] = uSrc1.au8[5];
14578 puDst->au32[6] = uSrc1.au8[6];
14579 puDst->au32[7] = uSrc1.au8[7];
14580}
14581
14582
14583/*
14584 * PMOVZXBQ / VPMOVZXBQ
14585 */
14586IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u128_fallback,(PRTUINT128U puDst, uint16_t uSrc))
14587{
14588 RTUINT16U uSrc1 = { uSrc };
14589 puDst->au64[0] = uSrc1.au8[0];
14590 puDst->au64[1] = uSrc1.au8[1];
14591}
14592
14593
14594IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxbq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14595{
14596 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14597 puDst->au64[0] = uSrc1.au8[0];
14598 puDst->au64[1] = uSrc1.au8[1];
14599 puDst->au64[2] = uSrc1.au8[2];
14600 puDst->au64[3] = uSrc1.au8[3];
14601}
14602
14603
14604/*
14605 * PMOVZXWD / VPMOVZXWD
14606 */
14607IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14608{
14609 RTUINT64U uSrc1 = { uSrc };
14610 puDst->au32[0] = uSrc1.au16[0];
14611 puDst->au32[1] = uSrc1.au16[1];
14612 puDst->au32[2] = uSrc1.au16[2];
14613 puDst->au32[3] = uSrc1.au16[3];
14614}
14615
14616
14617IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwd_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14618{
14619 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14620 puDst->au32[0] = uSrc1.au16[0];
14621 puDst->au32[1] = uSrc1.au16[1];
14622 puDst->au32[2] = uSrc1.au16[2];
14623 puDst->au32[3] = uSrc1.au16[3];
14624 puDst->au32[4] = uSrc1.au16[4];
14625 puDst->au32[5] = uSrc1.au16[5];
14626 puDst->au32[6] = uSrc1.au16[6];
14627 puDst->au32[7] = uSrc1.au16[7];
14628}
14629
14630
14631/*
14632 * PMOVZXWQ / VPMOVZXWQ
14633 */
14634IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u128_fallback,(PRTUINT128U puDst, uint32_t uSrc))
14635{
14636 RTUINT32U uSrc1 = { uSrc };
14637 puDst->au64[0] = uSrc1.au16[0];
14638 puDst->au64[1] = uSrc1.au16[1];
14639}
14640
14641
14642IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxwq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14643{
14644 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14645 puDst->au64[0] = uSrc1.au16[0];
14646 puDst->au64[1] = uSrc1.au16[1];
14647 puDst->au64[2] = uSrc1.au16[2];
14648 puDst->au64[3] = uSrc1.au16[3];
14649}
14650
14651
14652/*
14653 * PMOVZXDQ / VPMOVZXDQ
14654 */
14655IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u128_fallback,(PRTUINT128U puDst, uint64_t uSrc))
14656{
14657 RTUINT64U uSrc1 = { uSrc };
14658 puDst->au64[0] = uSrc1.au32[0];
14659 puDst->au64[1] = uSrc1.au32[1];
14660}
14661
14662
14663IEM_DECL_IMPL_DEF(void, iemAImpl_vpmovzxdq_u256_fallback,(PRTUINT256U puDst, PCRTUINT128U puSrc))
14664{
14665 RTUINT128U uSrc1 = *puSrc; /* puDst could overlap */
14666 puDst->au64[0] = uSrc1.au32[0];
14667 puDst->au64[1] = uSrc1.au32[1];
14668 puDst->au64[2] = uSrc1.au32[2];
14669 puDst->au64[3] = uSrc1.au32[3];
14670}
14671
14672/**
14673 * Converts from the packed IPRT 32-bit (single precision) floating point format to
14674 * the SoftFloat 32-bit floating point format (float32_t).
14675 *
14676 * This is only a structure format conversion, nothing else.
14677 */
14678DECLINLINE(float32_t) iemFpSoftF32FromIprt(PCRTFLOAT32U pr32Val)
14679{
14680 float32_t Tmp;
14681 Tmp.v = pr32Val->u;
14682 return Tmp;
14683}
14684
14685
14686/**
14687 * Converts from SoftFloat 32-bit floating point format (float32_t)
14688 * to the packed IPRT 32-bit floating point (RTFLOAT32U) format.
14689 *
14690 * This is only a structure format conversion, nothing else.
14691 */
14692DECLINLINE(PRTFLOAT32U) iemFpSoftF32ToIprt(PRTFLOAT32U pr32Dst, float32_t const r32XSrc)
14693{
14694 pr32Dst->u = r32XSrc.v;
14695 return pr32Dst;
14696}
14697
14698
14699/**
14700 * Converts from the packed IPRT 64-bit (single precision) floating point format to
14701 * the SoftFloat 64-bit floating point format (float64_t).
14702 *
14703 * This is only a structure format conversion, nothing else.
14704 */
14705DECLINLINE(float64_t) iemFpSoftF64FromIprt(PCRTFLOAT64U pr64Val)
14706{
14707 float64_t Tmp;
14708 Tmp.v = pr64Val->u;
14709 return Tmp;
14710}
14711
14712
14713/**
14714 * Converts from SoftFloat 64-bit floating point format (float64_t)
14715 * to the packed IPRT 64-bit floating point (RTFLOAT64U) format.
14716 *
14717 * This is only a structure format conversion, nothing else.
14718 */
14719DECLINLINE(PRTFLOAT64U) iemFpSoftF64ToIprt(PRTFLOAT64U pr64Dst, float64_t const r64XSrc)
14720{
14721 pr64Dst->u = r64XSrc.v;
14722 return pr64Dst;
14723}
14724
14725
/**
 * Initializer list for a SoftFloat state structure derived from an MXCSR value.
 *
 * The rounding mode is selected from the MXCSR.RC field and the exception
 * mask from the MXCSR exception mask bits; the other entries are constants.
 *
 * @param   a_Mxcsr     The MXCSR value to derive the state from.  Expanded
 *                      several times, so it must be free of side effects.
 */
# define IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(a_Mxcsr) \
    { \
        /* Tininess detection mode. */ \
        softfloat_tininess_afterRounding, \
        /* Rounding mode picked by MXCSR.RC; anything else falls back on round-towards-zero. */ \
        (uint8_t)(  ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_NEAREST ? softfloat_round_near_even \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_UP      ? softfloat_round_max \
                  : ((a_Mxcsr) & X86_MXCSR_RC_MASK) == X86_MXCSR_RC_DOWN    ? softfloat_round_min \
                  :                                                           softfloat_round_minMag), \
        /* Starts out as zero (presumably the pending exception flags - confirm against softfloat_state_t). */ \
        0, \
        /* Exception mask bits, shifted down.  Matches X86_FSW_?E */ \
        (uint8_t)(((a_Mxcsr) & X86_MXCSR_XCPT_MASK) >> X86_MXCSR_XCPT_MASK_SHIFT), \
        /* Rounding precision, not relevant for SIMD. */ \
        32 \
    }
14738
14739#ifdef IEM_WITHOUT_ASSEMBLY
14740
14741/**
14742 * Helper for transfering exception to MXCSR and setting the result value
14743 * accordingly.
14744 *
14745 * @returns Updated MXCSR.
14746 * @param pSoftState The SoftFloat state following the operation.
14747 * @param r32Result The result of the SoftFloat operation.
14748 * @param pr32Result Where to store the result for IEM.
14749 * @param fMxcsr The original MXCSR value.
14750 */
14751DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float32_t r32Result,
14752 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14753{
14754 iemFpSoftF32ToIprt(pr32Result, r32Result);
14755
14756 uint8_t fXcpt = pSoftState->exceptionFlags;
14757 if ( (fMxcsr & X86_MXCSR_FZ)
14758 && RTFLOAT32U_IS_SUBNORMAL(pr32Result))
14759 {
14760 /* Underflow masked and flush to zero is set. */
14761 pr32Result->s.uFraction = 0;
14762 pr32Result->s.uExponent = 0;
14763 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14764 }
14765
14766 /* If DAZ is set \#DE is never set. */
14767 if ( fMxcsr & X86_MXCSR_DAZ
14768 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14769 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14770 fXcpt &= ~X86_MXCSR_DE;
14771
14772 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14773}
14774
14775
14776/**
14777 * Helper for transfering exception to MXCSR and setting the result value
14778 * accordingly - ignores Flush-to-Zero.
14779 *
14780 * @returns Updated MXCSR.
14781 * @param pSoftState The SoftFloat state following the operation.
14782 * @param r32Result The result of the SoftFloat operation.
14783 * @param pr32Result Where to store the result for IEM.
14784 * @param fMxcsr The original MXCSR value.
14785 */
14786DECLINLINE(uint32_t) iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float32_t r32Result,
14787 PRTFLOAT32U pr32Result, uint32_t fMxcsr)
14788{
14789 iemFpSoftF32ToIprt(pr32Result, r32Result);
14790
14791 uint8_t fXcpt = pSoftState->exceptionFlags;
14792 /* If DAZ is set \#DE is never set. */
14793 if ( fMxcsr & X86_MXCSR_DAZ
14794 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14795 && (RTFLOAT32U_IS_SUBNORMAL(pr32Result))))
14796 fXcpt &= ~X86_MXCSR_DE;
14797
14798 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14799}
14800
14801
14802/**
14803 * Helper for transfering exception to MXCSR and setting the result value
14804 * accordingly.
14805 *
14806 * @returns Updated MXCSR.
14807 * @param pSoftState The SoftFloat state following the operation.
14808 * @param r64Result The result of the SoftFloat operation.
14809 * @param pr64Result Where to store the result for IEM.
14810 * @param fMxcsr The original MXCSR value.
14811 */
14812DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResult(softfloat_state_t const *pSoftState, float64_t r64Result,
14813 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14814{
14815 iemFpSoftF64ToIprt(pr64Result, r64Result);
14816 uint8_t fXcpt = pSoftState->exceptionFlags;
14817 if ( (fMxcsr & X86_MXCSR_FZ)
14818 && RTFLOAT64U_IS_SUBNORMAL(pr64Result))
14819 {
14820 /* Underflow masked and flush to zero is set. */
14821 iemFpSoftF64ToIprt(pr64Result, r64Result);
14822 pr64Result->s.uFractionHigh = 0;
14823 pr64Result->s.uFractionLow = 0;
14824 pr64Result->s.uExponent = 0;
14825 fXcpt |= X86_MXCSR_UE | X86_MXCSR_PE;
14826 }
14827
14828 /* If DAZ is set \#DE is never set. */
14829 if ( fMxcsr & X86_MXCSR_DAZ
14830 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14831 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14832 fXcpt &= ~X86_MXCSR_DE;
14833
14834 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14835}
14836
14837
14838/**
14839 * Helper for transfering exception to MXCSR and setting the result value
14840 * accordingly - ignores Flush-to-Zero.
14841 *
14842 * @returns Updated MXCSR.
14843 * @param pSoftState The SoftFloat state following the operation.
14844 * @param r64Result The result of the SoftFloat operation.
14845 * @param pr64Result Where to store the result for IEM.
14846 * @param fMxcsr The original MXCSR value.
14847 */
14848DECLINLINE(uint32_t) iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(softfloat_state_t const *pSoftState, float64_t r64Result,
14849 PRTFLOAT64U pr64Result, uint32_t fMxcsr)
14850{
14851 iemFpSoftF64ToIprt(pr64Result, r64Result);
14852
14853 uint8_t fXcpt = pSoftState->exceptionFlags;
14854 /* If DAZ is set \#DE is never set. */
14855 if ( fMxcsr & X86_MXCSR_DAZ
14856 || ( (fXcpt & X86_MXCSR_DE) /* Softfloat sets DE for sub-normal values. */
14857 && (RTFLOAT64U_IS_SUBNORMAL(pr64Result))))
14858 fXcpt &= ~X86_MXCSR_DE;
14859
14860 return fMxcsr | (fXcpt & X86_MXCSR_XCPT_FLAGS);
14861}
14862
14863#endif /* IEM_WITHOUT_ASSEMBLY */
14864
14865
14866/**
14867 * Sets the given single precision floating point input value to the given output taking the Denormals-as-zero flag
14868 * in MXCSR into account.
14869 *
14870 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14871 * @param pr32Val Where to store the result.
14872 * @param fMxcsr The input MXCSR value.
14873 * @param pr32Src The value to use.
14874 */
14875DECLINLINE(uint32_t) iemSsePrepareValueR32(PRTFLOAT32U pr32Val, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
14876{
14877 if (RTFLOAT32U_IS_SUBNORMAL(pr32Src))
14878 {
14879 if (fMxcsr & X86_MXCSR_DAZ)
14880 {
14881 /* De-normals are changed to 0. */
14882 pr32Val->s.fSign = pr32Src->s.fSign;
14883 pr32Val->s.uFraction = 0;
14884 pr32Val->s.uExponent = 0;
14885 return 0;
14886 }
14887
14888 *pr32Val = *pr32Src;
14889 return X86_MXCSR_DE;
14890 }
14891
14892 *pr32Val = *pr32Src;
14893 return 0;
14894}
14895
14896
14897/**
14898 * Sets the given double precision floating point input value to the given output taking the Denormals-as-zero flag
14899 * in MXCSR into account.
14900 *
14901 * @returns The output MXCSR De-normal flag if the input is a de-normal and the DAZ flag is not set.
14902 * @param pr64Val Where to store the result.
14903 * @param fMxcsr The input MXCSR value.
14904 * @param pr64Src The value to use.
14905 */
14906DECLINLINE(uint32_t) iemSsePrepareValueR64(PRTFLOAT64U pr64Val, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
14907{
14908 if (RTFLOAT64U_IS_SUBNORMAL(pr64Src))
14909 {
14910 if (fMxcsr & X86_MXCSR_DAZ)
14911 {
14912 /* De-normals are changed to 0. */
14913 pr64Val->s64.fSign = pr64Src->s.fSign;
14914 pr64Val->s64.uFraction = 0;
14915 pr64Val->s64.uExponent = 0;
14916 return 0;
14917 }
14918
14919 *pr64Val = *pr64Src;
14920 return X86_MXCSR_DE;
14921 }
14922
14923 *pr64Val = *pr64Src;
14924 return 0;
14925}
14926
14927#ifdef IEM_WITHOUT_ASSEMBLY
14928
14929/**
14930 * Validates the given input operands returning whether the operation can continue or whether one
14931 * of the source operands contains a NaN value, setting the output accordingly.
14932 *
14933 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14934 * @param pr32Res Where to store the result in case the operation can't continue.
14935 * @param pr32Val1 The first input operand.
14936 * @param pr32Val2 The second input operand.
14937 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14938 */
14939DECLINLINE(bool) iemSseBinaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2, uint32_t *pfMxcsr)
14940{
14941 uint8_t const cQNan = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) + RTFLOAT32U_IS_QUIET_NAN(pr32Val2);
14942 uint8_t const cSNan = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) + RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val2);
14943 if (cSNan + cQNan == 2)
14944 {
14945 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14946 *pr32Res = *pr32Val1;
14947 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14948 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14949 return true;
14950 }
14951 if (cSNan)
14952 {
14953 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14954 *pr32Res = RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14955 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
14956 *pfMxcsr |= X86_MXCSR_IE;
14957 return true;
14958 }
14959 if (cQNan)
14960 {
14961 /* The QNan operand is placed into the result. */
14962 *pr32Res = RTFLOAT32U_IS_QUIET_NAN(pr32Val1) ? *pr32Val1 : *pr32Val2;
14963 return true;
14964 }
14965
14966 Assert(!cQNan && !cSNan);
14967 return false;
14968}
14969
14970
14971/**
14972 * Validates the given double precision input operands returning whether the operation can continue or whether one
14973 * of the source operands contains a NaN value, setting the output accordingly.
14974 *
14975 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in one of the operands (true).
14976 * @param pr64Res Where to store the result in case the operation can't continue.
14977 * @param pr64Val1 The first input operand.
14978 * @param pr64Val2 The second input operand.
14979 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
14980 */
14981DECLINLINE(bool) iemSseBinaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2, uint32_t *pfMxcsr)
14982{
14983 uint8_t const cQNan = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) + RTFLOAT64U_IS_QUIET_NAN(pr64Val2);
14984 uint8_t const cSNan = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) + RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val2);
14985 if (cSNan + cQNan == 2)
14986 {
14987 /* Both values are either SNan or QNan, first operand is placed into the result and converted to a QNan. */
14988 *pr64Res = *pr64Val1;
14989 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14990 *pfMxcsr |= (cSNan ? X86_MXCSR_IE : 0);
14991 return true;
14992 }
14993 if (cSNan)
14994 {
14995 /* One operand is an SNan and placed into the result, converting it to a QNan. */
14996 *pr64Res = RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
14997 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
14998 *pfMxcsr |= X86_MXCSR_IE;
14999 return true;
15000 }
15001 if (cQNan)
15002 {
15003 /* The QNan operand is placed into the result. */
15004 *pr64Res = RTFLOAT64U_IS_QUIET_NAN(pr64Val1) ? *pr64Val1 : *pr64Val2;
15005 return true;
15006 }
15007
15008 Assert(!cQNan && !cSNan);
15009 return false;
15010}
15011
15012
15013/**
15014 * Validates the given single input operand returning whether the operation can continue or whether
15015 * contains a NaN value, setting the output accordingly.
15016 *
15017 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15018 * @param pr32Res Where to store the result in case the operation can't continue.
15019 * @param pr32Val The input operand.
15020 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15021 */
15022DECLINLINE(bool) iemSseUnaryValIsNaNR32(PRTFLOAT32U pr32Res, PCRTFLOAT32U pr32Val, uint32_t *pfMxcsr)
15023{
15024 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Val))
15025 {
15026 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15027 *pr32Res = *pr32Val;
15028 pr32Res->s.uFraction |= RT_BIT_32(RTFLOAT32U_FRACTION_BITS - 1);
15029 *pfMxcsr |= X86_MXCSR_IE;
15030 return true;
15031 }
15032 if (RTFLOAT32U_IS_QUIET_NAN(pr32Val))
15033 {
15034 /* The QNan operand is placed into the result. */
15035 *pr32Res = *pr32Val;
15036 return true;
15037 }
15038
15039 return false;
15040}
15041
15042
15043/**
15044 * Validates the given double input operand returning whether the operation can continue or whether
15045 * contains a NaN value, setting the output accordingly.
15046 *
15047 * @returns Flag whether the operation can continue (false) or whether a NaN value was detected in the operand (true).
15048 * @param pr64Res Where to store the result in case the operation can't continue.
15049 * @param pr64Val The input operand.
15050 * @param pfMxcsr Where to return the modified MXCSR state when false is returned.
15051 */
15052DECLINLINE(bool) iemSseUnaryValIsNaNR64(PRTFLOAT64U pr64Res, PCRTFLOAT64U pr64Val, uint32_t *pfMxcsr)
15053{
15054 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Val))
15055 {
15056 /* One operand is an SNan and placed into the result, converting it to a QNan. */
15057 *pr64Res = *pr64Val;
15058 pr64Res->s64.uFraction |= RT_BIT_64(RTFLOAT64U_FRACTION_BITS - 1);
15059 *pfMxcsr |= X86_MXCSR_IE;
15060 return true;
15061 }
15062 if (RTFLOAT64U_IS_QUIET_NAN(pr64Val))
15063 {
15064 /* The QNan operand is placed into the result. */
15065 *pr64Res = *pr64Val;
15066 return true;
15067 }
15068
15069 return false;
15070}
15071
15072#endif /* IEM_WITHOUT_ASSEMBLY */
15073
15074/**
15075 * ADDPS
15076 */
15077#ifdef IEM_WITHOUT_ASSEMBLY
15078static uint32_t iemAImpl_addps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15079{
15080 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15081 return fMxcsr;
15082
15083 RTFLOAT32U r32Src1, r32Src2;
15084 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15085 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15086 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15087 float32_t r32Result = f32_add(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15088 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15089}
15090
15091
15092IEM_DECL_IMPL_DEF(void, iemAImpl_addps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15093{
15094 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15095 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15096 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15097 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15098}
15099#endif
15100
15101
15102/**
15103 * ADDSS
15104 */
15105#ifdef IEM_WITHOUT_ASSEMBLY
15106IEM_DECL_IMPL_DEF(void, iemAImpl_addss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15107{
15108 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15109 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15110 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15111 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15112}
15113#endif
15114
15115
15116/**
15117 * ADDPD
15118 */
15119#ifdef IEM_WITHOUT_ASSEMBLY
15120static uint32_t iemAImpl_addpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15121{
15122 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15123 return fMxcsr;
15124
15125 RTFLOAT64U r64Src1, r64Src2;
15126 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15127 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15128 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15129 float64_t r64Result = f64_add(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15130 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15131}
15132
15133
15134IEM_DECL_IMPL_DEF(void, iemAImpl_addpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15135{
15136 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15137 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15138}
15139#endif
15140
15141
15142/**
15143 * ADDSD
15144 */
15145#ifdef IEM_WITHOUT_ASSEMBLY
15146IEM_DECL_IMPL_DEF(void, iemAImpl_addsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15147{
15148 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15149 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15150}
15151#endif
15152
15153
15154/**
15155 * MULPS
15156 */
15157#ifdef IEM_WITHOUT_ASSEMBLY
15158static uint32_t iemAImpl_mulps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15159{
15160 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15161 return fMxcsr;
15162
15163 RTFLOAT32U r32Src1, r32Src2;
15164 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15165 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15166 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15167 float32_t r32Result = f32_mul(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15168 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15169}
15170
15171
15172IEM_DECL_IMPL_DEF(void, iemAImpl_mulps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15173{
15174 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15175 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15176 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15177 pResult->MXCSR |= iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15178}
15179#endif
15180
15181
15182/**
15183 * MULSS
15184 */
15185#ifdef IEM_WITHOUT_ASSEMBLY
15186IEM_DECL_IMPL_DEF(void, iemAImpl_mulss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15187{
15188 pResult->MXCSR = iemAImpl_mulps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15189 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15190 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15191 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15192}
15193#endif
15194
15195
15196/**
15197 * MULPD
15198 */
15199#ifdef IEM_WITHOUT_ASSEMBLY
15200static uint32_t iemAImpl_mulpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15201{
15202 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15203 return fMxcsr;
15204
15205 RTFLOAT64U r64Src1, r64Src2;
15206 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15207 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15208 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15209 float64_t r64Result = f64_mul(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15210 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15211}
15212
15213
15214IEM_DECL_IMPL_DEF(void, iemAImpl_mulpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15215{
15216 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15217 pResult->MXCSR |= iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15218}
15219#endif
15220
15221
15222/**
15223 * MULSD
15224 */
15225#ifdef IEM_WITHOUT_ASSEMBLY
15226IEM_DECL_IMPL_DEF(void, iemAImpl_mulsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15227{
15228 pResult->MXCSR = iemAImpl_mulpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15229 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15230}
15231#endif
15232
15233
15234/**
15235 * SUBPS
15236 */
15237#ifdef IEM_WITHOUT_ASSEMBLY
15238static uint32_t iemAImpl_subps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15239{
15240 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15241 return fMxcsr;
15242
15243 RTFLOAT32U r32Src1, r32Src2;
15244 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15245 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15246 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15247 float32_t r32Result = f32_sub(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15248 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15249}
15250
15251
15252IEM_DECL_IMPL_DEF(void, iemAImpl_subps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15253{
15254 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15255 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15256 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15257 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15258}
15259#endif
15260
15261
15262/**
15263 * SUBSS
15264 */
15265#ifdef IEM_WITHOUT_ASSEMBLY
15266IEM_DECL_IMPL_DEF(void, iemAImpl_subss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15267{
15268 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15269 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15270 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15271 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15272}
15273#endif
15274
15275
15276/**
15277 * SUBPD
15278 */
15279#ifdef IEM_WITHOUT_ASSEMBLY
15280static uint32_t iemAImpl_subpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15281{
15282 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15283 return fMxcsr;
15284
15285 RTFLOAT64U r64Src1, r64Src2;
15286 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15287 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15288 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15289 float64_t r64Result = f64_sub(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15290 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15291}
15292
15293
15294IEM_DECL_IMPL_DEF(void, iemAImpl_subpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15295{
15296 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15297 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15298}
15299#endif
15300
15301
15302/**
15303 * SUBSD
15304 */
15305#ifdef IEM_WITHOUT_ASSEMBLY
15306IEM_DECL_IMPL_DEF(void, iemAImpl_subsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15307{
15308 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15309 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15310}
15311#endif
15312
15313
15314/**
15315 * MINPS
15316 */
15317#ifdef IEM_WITHOUT_ASSEMBLY
15318static uint32_t iemAImpl_minps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15319{
15320 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15321 {
15322 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15323 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15324 return fMxcsr | X86_MXCSR_IE;
15325 }
15326
15327 RTFLOAT32U r32Src1, r32Src2;
15328 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15329 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15330 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15331 {
15332 *pr32Res = r32Src2;
15333 return fMxcsr;
15334 }
15335
15336 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15337 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15338 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15339 fLe
15340 ? iemFpSoftF32FromIprt(&r32Src1)
15341 : iemFpSoftF32FromIprt(&r32Src2),
15342 pr32Res, fMxcsr);
15343}
15344
15345
15346IEM_DECL_IMPL_DEF(void, iemAImpl_minps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15347{
15348 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15349 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15350 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15351 pResult->MXCSR |= iemAImpl_minps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15352}
15353#endif
15354
15355
15356/**
15357 * MINSS
15358 */
15359#ifdef IEM_WITHOUT_ASSEMBLY
15360IEM_DECL_IMPL_DEF(void, iemAImpl_minss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15361{
15362 pResult->MXCSR = iemAImpl_minps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15363 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15364 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15365 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15366}
15367#endif
15368
15369
15370/**
15371 * MINPD
15372 */
15373#ifdef IEM_WITHOUT_ASSEMBLY
15374static uint32_t iemAImpl_minpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15375{
15376 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15377 {
15378 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15379 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15380 return fMxcsr | X86_MXCSR_IE;
15381 }
15382
15383 RTFLOAT64U r64Src1, r64Src2;
15384 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15385 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15386 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15387 {
15388 *pr64Res = r64Src2;
15389 return fMxcsr;
15390 }
15391
15392 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15393 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15394 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15395 fLe
15396 ? iemFpSoftF64FromIprt(&r64Src1)
15397 : iemFpSoftF64FromIprt(&r64Src2),
15398 pr64Res, fMxcsr);
15399}
15400
15401
15402IEM_DECL_IMPL_DEF(void, iemAImpl_minpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15403{
15404 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15405 pResult->MXCSR |= iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15406}
15407#endif
15408
15409
15410/**
15411 * MINSD
15412 */
15413#ifdef IEM_WITHOUT_ASSEMBLY
15414IEM_DECL_IMPL_DEF(void, iemAImpl_minsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15415{
15416 pResult->MXCSR = iemAImpl_minpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15417 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15418}
15419#endif
15420
15421
15422/**
15423 * DIVPS
15424 */
15425#ifdef IEM_WITHOUT_ASSEMBLY
15426static uint32_t iemAImpl_divps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15427{
15428 if (iemSseBinaryValIsNaNR32(pr32Res, pr32Val1, pr32Val2, &fMxcsr))
15429 return fMxcsr;
15430
15431 RTFLOAT32U r32Src1, r32Src2;
15432 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15433 fDe |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15434 if (RTFLOAT32U_IS_ZERO(&r32Src2))
15435 {
15436 if ( RTFLOAT32U_IS_ZERO(&r32Src1)
15437 || RTFLOAT32U_IS_QUIET_NAN(&r32Src1))
15438 {
15439 *pr32Res = g_ar32QNaN[1];
15440 return fMxcsr | X86_MXCSR_IE;
15441 }
15442 else if (RTFLOAT32U_IS_INF(&r32Src1))
15443 {
15444 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15445 return fMxcsr;
15446 }
15447 else
15448 {
15449 *pr32Res = g_ar32Infinity[r32Src1.s.fSign != r32Src2.s.fSign];
15450 return fMxcsr | X86_MXCSR_ZE;
15451 }
15452 }
15453
15454 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15455 float32_t r32Result = f32_div(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15456 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15457}
15458
15459
15460IEM_DECL_IMPL_DEF(void, iemAImpl_divps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15461{
15462 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15463 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15464 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15465 pResult->MXCSR |= iemAImpl_divps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15466}
15467#endif
15468
15469
15470/**
15471 * DIVSS
15472 */
15473#ifdef IEM_WITHOUT_ASSEMBLY
15474IEM_DECL_IMPL_DEF(void, iemAImpl_divss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15475{
15476 pResult->MXCSR = iemAImpl_divps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15477 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15478 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15479 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15480}
15481#endif
15482
15483
15484/**
15485 * DIVPD
15486 */
15487#ifdef IEM_WITHOUT_ASSEMBLY
15488static uint32_t iemAImpl_divpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15489{
15490 if (iemSseBinaryValIsNaNR64(pr64Res, pr64Val1, pr64Val2, &fMxcsr))
15491 return fMxcsr;
15492
15493 RTFLOAT64U r64Src1, r64Src2;
15494 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15495 fDe |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15496 if (RTFLOAT64U_IS_ZERO(&r64Src2))
15497 {
15498 if ( RTFLOAT64U_IS_ZERO(&r64Src1)
15499 || RTFLOAT64U_IS_QUIET_NAN(&r64Src1))
15500 {
15501 *pr64Res = g_ar64QNaN[1];
15502 return fMxcsr | X86_MXCSR_IE;
15503 }
15504 else if (RTFLOAT64U_IS_INF(&r64Src1))
15505 {
15506 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15507 return fMxcsr;
15508 }
15509 else
15510 {
15511 *pr64Res = g_ar64Infinity[r64Src1.s.fSign != r64Src2.s.fSign];
15512 return fMxcsr | X86_MXCSR_ZE;
15513 }
15514 }
15515
15516 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15517 float64_t r64Result = f64_div(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15518 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15519}
15520
15521
15522IEM_DECL_IMPL_DEF(void, iemAImpl_divpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15523{
15524 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15525 pResult->MXCSR |= iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15526}
15527#endif
15528
15529
15530/**
15531 * DIVSD
15532 */
15533#ifdef IEM_WITHOUT_ASSEMBLY
15534IEM_DECL_IMPL_DEF(void, iemAImpl_divsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15535{
15536 pResult->MXCSR = iemAImpl_divpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15537 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15538}
15539#endif
15540
15541
15542/**
15543 * MAXPS
15544 */
15545#ifdef IEM_WITHOUT_ASSEMBLY
15546static uint32_t iemAImpl_maxps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1, PCRTFLOAT32U pr32Val2)
15547{
15548 if (RTFLOAT32U_IS_NAN(pr32Val1) || RTFLOAT32U_IS_NAN(pr32Val2))
15549 {
15550 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15551 iemSsePrepareValueR32(pr32Res, fMxcsr, pr32Val2);
15552 return fMxcsr | X86_MXCSR_IE;
15553 }
15554
15555 RTFLOAT32U r32Src1, r32Src2;
15556 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15557 fMxcsr |= iemSsePrepareValueR32(&r32Src2, fMxcsr, pr32Val2);
15558 if (RTFLOAT32U_IS_ZERO(&r32Src1) && RTFLOAT32U_IS_ZERO(&r32Src2))
15559 {
15560 *pr32Res = r32Src2;
15561 return fMxcsr;
15562 }
15563
15564 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15565 bool fLe = f32_le(iemFpSoftF32FromIprt(&r32Src1), iemFpSoftF32FromIprt(&r32Src2), &SoftState);
15566 return iemSseSoftStateAndR32ToMxcsrAndIprtResultNoFz(&SoftState,
15567 fLe
15568 ? iemFpSoftF32FromIprt(&r32Src2)
15569 : iemFpSoftF32FromIprt(&r32Src1),
15570 pr32Res, fMxcsr);
15571}
15572
15573
15574IEM_DECL_IMPL_DEF(void, iemAImpl_maxps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15575{
15576 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15577 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15578 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15579 pResult->MXCSR |= iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15580}
15581#endif
15582
15583
15584/**
15585 * MAXSS
15586 */
15587#ifdef IEM_WITHOUT_ASSEMBLY
15588IEM_DECL_IMPL_DEF(void, iemAImpl_maxss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15589{
15590 pResult->MXCSR = iemAImpl_maxps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], pr32Src2);
15591 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15592 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15593 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15594}
15595#endif
15596
15597
15598/**
15599 * MAXPD
15600 */
15601#ifdef IEM_WITHOUT_ASSEMBLY
15602static uint32_t iemAImpl_maxpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1, PCRTFLOAT64U pr64Val2)
15603{
15604 if (RTFLOAT64U_IS_NAN(pr64Val1) || RTFLOAT64U_IS_NAN(pr64Val2))
15605 {
15606 /* The DAZ flag gets honored but the DE flag will not get set because \#IE has higher priority. */
15607 iemSsePrepareValueR64(pr64Res, fMxcsr, pr64Val2);
15608 return fMxcsr | X86_MXCSR_IE;
15609 }
15610
15611 RTFLOAT64U r64Src1, r64Src2;
15612 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15613 fMxcsr |= iemSsePrepareValueR64(&r64Src2, fMxcsr, pr64Val2);
15614 if (RTFLOAT64U_IS_ZERO(&r64Src1) && RTFLOAT64U_IS_ZERO(&r64Src2))
15615 {
15616 *pr64Res = r64Src2;
15617 return fMxcsr;
15618 }
15619
15620 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15621 bool fLe = f64_le(iemFpSoftF64FromIprt(&r64Src1), iemFpSoftF64FromIprt(&r64Src2), &SoftState);
15622 return iemSseSoftStateAndR64ToMxcsrAndIprtResultNoFz(&SoftState,
15623 fLe
15624 ? iemFpSoftF64FromIprt(&r64Src2)
15625 : iemFpSoftF64FromIprt(&r64Src1),
15626 pr64Res, fMxcsr);
15627}
15628
15629
15630IEM_DECL_IMPL_DEF(void, iemAImpl_maxpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15631{
15632 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15633 pResult->MXCSR |= iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15634}
15635#endif
15636
15637
15638/**
15639 * MAXSD
15640 */
15641#ifdef IEM_WITHOUT_ASSEMBLY
15642IEM_DECL_IMPL_DEF(void, iemAImpl_maxsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15643{
15644 pResult->MXCSR = iemAImpl_maxpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], pr64Src2);
15645 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15646}
15647#endif
15648
15649
15650/**
15651 * CVTSS2SD
15652 */
15653#ifdef IEM_WITHOUT_ASSEMBLY
15654static uint32_t iemAImpl_cvtss2sd_u128_r32_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15655{
15656 RTFLOAT32U r32Src1;
15657 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15658
15659 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15660 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15661 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15662}
15663
15664
15665IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2sd_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15666{
15667 pResult->MXCSR = iemAImpl_cvtss2sd_u128_r32_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr32Src2);
15668 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15669}
15670#endif
15671
15672
15673/**
15674 * CVTSD2SS
15675 */
15676#ifdef IEM_WITHOUT_ASSEMBLY
15677static uint32_t iemAImpl_cvtsd2ss_u128_r64_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15678{
15679 RTFLOAT64U r64Src1;
15680 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15681
15682 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15683 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15684 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15685}
15686
15687
15688IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2ss_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15689{
15690 pResult->MXCSR = iemAImpl_cvtsd2ss_u128_r64_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr64Src2);
15691 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15692 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15693 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15694}
15695#endif
15696
15697
15698/**
15699 * HADDPS
15700 */
15701#ifdef IEM_WITHOUT_ASSEMBLY
15702IEM_DECL_IMPL_DEF(void, iemAImpl_haddps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15703{
15704 pResult->MXCSR = iemAImpl_addps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15705 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15706 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15707 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15708}
15709#endif
15710
15711
15712/**
15713 * HADDPD
15714 */
15715#ifdef IEM_WITHOUT_ASSEMBLY
15716IEM_DECL_IMPL_DEF(void, iemAImpl_haddpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15717{
15718 pResult->MXCSR = iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15719 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15720}
15721#endif
15722
15723
15724/**
15725 * HSUBPS
15726 */
15727#ifdef IEM_WITHOUT_ASSEMBLY
15728IEM_DECL_IMPL_DEF(void, iemAImpl_hsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15729{
15730 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc1->ar32[1]);
15731 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc1->ar32[3]);
15732 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[0], &puSrc2->ar32[1]);
15733 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[2], &puSrc2->ar32[3]);
15734}
15735#endif
15736
15737
15738/**
15739 * HSUBPD
15740 */
15741#ifdef IEM_WITHOUT_ASSEMBLY
15742IEM_DECL_IMPL_DEF(void, iemAImpl_hsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15743{
15744 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc1->ar64[1]);
15745 pResult->MXCSR |= iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[0], &puSrc2->ar64[1]);
15746}
15747#endif
15748
15749
15750/**
15751 * SQRTPS
15752 */
15753#ifdef IEM_WITHOUT_ASSEMBLY
15754static uint32_t iemAImpl_sqrtps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15755{
15756 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15757 return fMxcsr;
15758
15759 RTFLOAT32U r32Src;
15760 uint32_t fDe = iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Val);
15761 if (RTFLOAT32U_IS_ZERO(&r32Src))
15762 {
15763 *pr32Res = r32Src;
15764 return fMxcsr;
15765 }
15766 else if (r32Src.s.fSign)
15767 {
15768 *pr32Res = g_ar32QNaN[1];
15769 return fMxcsr | X86_MXCSR_IE;
15770 }
15771
15772 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15773 float32_t r32Result = f32_sqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15774 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr | fDe);
15775}
15776
15777
15778IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15779{
15780 RT_NOREF(puSrc1);
15781
15782 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15783 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15784 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15785 pResult->MXCSR |= iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15786}
15787#endif
15788
15789
15790/**
15791 * SQRTSS
15792 */
15793#ifdef IEM_WITHOUT_ASSEMBLY
15794IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15795{
15796 pResult->MXCSR = iemAImpl_sqrtps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15797 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15798 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15799 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15800}
15801#endif
15802
15803
15804/**
15805 * SQRTPD
15806 */
15807#ifdef IEM_WITHOUT_ASSEMBLY
15808static uint32_t iemAImpl_sqrtpd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val)
15809{
15810 if (iemSseUnaryValIsNaNR64(pr64Res, pr64Val, &fMxcsr))
15811 return fMxcsr;
15812
15813 RTFLOAT64U r64Src;
15814 uint32_t fDe = iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Val);
15815 if (RTFLOAT64U_IS_ZERO(&r64Src))
15816 {
15817 *pr64Res = r64Src;
15818 return fMxcsr;
15819 }
15820 else if (r64Src.s.fSign)
15821 {
15822 *pr64Res = g_ar64QNaN[1];
15823 return fMxcsr | X86_MXCSR_IE;
15824 }
15825
15826 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15827 float64_t r64Result = f64_sqrt(iemFpSoftF64FromIprt(&r64Src), &SoftState);
15828 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr | fDe);
15829}
15830
15831
15832IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15833{
15834 RT_NOREF(puSrc1);
15835
15836 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15837 pResult->MXCSR |= iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15838}
15839#endif
15840
15841
15842/**
15843 * SQRTSD
15844 */
15845#ifdef IEM_WITHOUT_ASSEMBLY
15846IEM_DECL_IMPL_DEF(void, iemAImpl_sqrtsd_u128_r64,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT64U pr64Src2))
15847{
15848 pResult->MXCSR = iemAImpl_sqrtpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, pr64Src2);
15849 pResult->uResult.ar64[1] = puSrc1->ar64[1];
15850}
15851#endif
15852
15853
15854#ifdef IEM_WITHOUT_ASSEMBLY
15855/**
15856 * RSQRTPS
15857 */
15858static uint32_t iemAImpl_rsqrt_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val)
15859{
15860 if (iemSseUnaryValIsNaNR32(pr32Res, pr32Val, &fMxcsr))
15861 return fMxcsr;
15862
15863 RTFLOAT32U r32Src;
15864 iemSsePrepareValueR32(&r32Src, fMxcsr | X86_MXCSR_DAZ, pr32Val);
15865 if (RTFLOAT32U_IS_ZERO(&r32Src))
15866 {
15867 *pr32Res = g_ar32Infinity[r32Src.s.fSign];
15868 return fMxcsr;
15869 }
15870 else if (r32Src.s.fSign)
15871 {
15872 *pr32Res = g_ar32QNaN[1];
15873 return fMxcsr | X86_MXCSR_IE;
15874 }
15875
15876 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15877 float32_t r32Result = f32_rsqrt(iemFpSoftF32FromIprt(&r32Src), &SoftState);
15878 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15879}
15880
15881
15882IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15883{
15884 RT_NOREF(puSrc1);
15885
15886 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15887 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15888 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
15889 pResult->MXCSR |= iemAImpl_rsqrt_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
15890}
15891
15892
15893/**
15894 * RSQRTSS
15895 */
15896IEM_DECL_IMPL_DEF(void, iemAImpl_rsqrtss_u128_r32,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCRTFLOAT32U pr32Src2))
15897{
15898 pResult->MXCSR = iemAImpl_rsqrt_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, pr32Src2);
15899 pResult->uResult.ar32[1] = puSrc1->ar32[1];
15900 pResult->uResult.ar32[2] = puSrc1->ar32[2];
15901 pResult->uResult.ar32[3] = puSrc1->ar32[3];
15902}
15903#endif
15904
15905
15906/**
15907 * ADDSUBPS
15908 */
15909#ifdef IEM_WITHOUT_ASSEMBLY
15910IEM_DECL_IMPL_DEF(void, iemAImpl_addsubps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15911{
15912 RT_NOREF(puSrc1);
15913
15914 pResult->MXCSR = iemAImpl_subps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc1->ar32[0], &puSrc2->ar32[0]);
15915 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc1->ar32[1], &puSrc2->ar32[1]);
15916 pResult->MXCSR |= iemAImpl_subps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, &puSrc1->ar32[2], &puSrc2->ar32[2]);
15917 pResult->MXCSR |= iemAImpl_addps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, &puSrc1->ar32[3], &puSrc2->ar32[3]);
15918}
15919#endif
15920
15921
15922/**
15923 * ADDSUBPD
15924 */
15925#ifdef IEM_WITHOUT_ASSEMBLY
15926IEM_DECL_IMPL_DEF(void, iemAImpl_addsubpd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15927{
15928 RT_NOREF(puSrc1);
15929
15930 pResult->MXCSR = iemAImpl_subpd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc1->ar64[0], &puSrc2->ar64[0]);
15931 pResult->MXCSR |= iemAImpl_addpd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc1->ar64[1], &puSrc2->ar64[1]);
15932}
15933#endif
15934
15935
15936/**
15937 * CVTPD2PS
15938 */
15939#ifdef IEM_WITHOUT_ASSEMBLY
15940static uint32_t iemAImpl_cvtpd2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Val1)
15941{
15942 RTFLOAT64U r64Src1;
15943 fMxcsr |= iemSsePrepareValueR64(&r64Src1, fMxcsr, pr64Val1);
15944
15945 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15946 float32_t r32Result = f64_to_f32(iemFpSoftF64FromIprt(&r64Src1), &SoftState);
15947 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15948}
15949
15950
15951IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15952{
15953 RT_NOREF(puSrc1);
15954
15955 pResult->MXCSR = iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
15956 pResult->MXCSR |= iemAImpl_cvtpd2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
15957 pResult->uResult.au32[2] = 0;
15958 pResult->uResult.au32[3] = 0;
15959}
15960#endif
15961
15962
15963/**
15964 * CVTPS2PD
15965 */
15966#ifdef IEM_WITHOUT_ASSEMBLY
15967static uint32_t iemAImpl_cvtps2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Val1)
15968{
15969 RTFLOAT32U r32Src1;
15970 fMxcsr |= iemSsePrepareValueR32(&r32Src1, fMxcsr, pr32Val1);
15971
15972 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15973 float64_t r64Result = f32_to_f64(iemFpSoftF32FromIprt(&r32Src1), &SoftState);
15974 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
15975}
15976
15977
15978IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
15979{
15980 RT_NOREF(puSrc1);
15981
15982 pResult->MXCSR = iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
15983 pResult->MXCSR |= iemAImpl_cvtps2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
15984}
15985#endif
15986
15987
15988/**
15989 * CVTDQ2PS
15990 */
15991#ifdef IEM_WITHOUT_ASSEMBLY
15992static uint32_t iemAImpl_cvtdq2ps_u128_worker(PRTFLOAT32U pr32Res, uint32_t fMxcsr, int32_t i32Val)
15993{
15994 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
15995 float32_t r32Result = i32_to_f32(i32Val, &SoftState);
15996 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Result, pr32Res, fMxcsr);
15997}
15998
15999
16000IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2ps_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16001{
16002 RT_NOREF(puSrc1);
16003
16004 pResult->MXCSR = iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16005 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16006 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[2], pFpuState->MXCSR, puSrc2->ai32[2]);
16007 pResult->MXCSR |= iemAImpl_cvtdq2ps_u128_worker(&pResult->uResult.ar32[3], pFpuState->MXCSR, puSrc2->ai32[3]);
16008}
16009#endif
16010
16011
16012/**
16013 * CVTPS2DQ
16014 */
16015#ifdef IEM_WITHOUT_ASSEMBLY
16016static uint32_t iemAImpl_cvtps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16017{
16018 RTFLOAT32U r32Src;
16019 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16020
16021 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16022 *pi32Res = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16023 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16024}
16025
16026
16027IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16028{
16029 RT_NOREF(puSrc1);
16030
16031 pResult->MXCSR = iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16032 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16033 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16034 pResult->MXCSR |= iemAImpl_cvtps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16035}
16036#endif
16037
16038
16039/**
16040 * CVTTPS2DQ
16041 */
16042#ifdef IEM_WITHOUT_ASSEMBLY
16043static uint32_t iemAImpl_cvttps2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT32U pr32Src)
16044{
16045 RTFLOAT32U r32Src;
16046 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* De-normal seems to be ignored. */
16047
16048 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16049 SoftState.roundingMode = softfloat_round_minMag;
16050 *pi32Res = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
16051 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16052}
16053
16054
16055IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16056{
16057 RT_NOREF(puSrc1);
16058
16059 pResult->MXCSR = iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar32[0]);
16060 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar32[1]);
16061 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[2], pFpuState->MXCSR, &puSrc2->ar32[2]);
16062 pResult->MXCSR |= iemAImpl_cvttps2dq_u128_worker(&pResult->uResult.ai32[3], pFpuState->MXCSR, &puSrc2->ar32[3]);
16063}
16064#endif
16065
16066
16067/**
16068 * CVTTPD2DQ
16069 */
16070#ifdef IEM_WITHOUT_ASSEMBLY
16071static uint32_t iemAImpl_cvttpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16072{
16073 RTFLOAT64U r64Src;
16074 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16075
16076 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16077 SoftState.roundingMode = softfloat_round_minMag;
16078 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16079 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16080}
16081
16082
16083IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16084{
16085 RT_NOREF(puSrc1);
16086
16087 pResult->MXCSR = iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16088 pResult->MXCSR |= iemAImpl_cvttpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16089 pResult->uResult.au64[1] = 0;
16090}
16091#endif
16092
16093
16094/**
16095 * CVTDQ2PD
16096 */
16097#ifdef IEM_WITHOUT_ASSEMBLY
16098static uint32_t iemAImpl_cvtdq2pd_u128_worker(PRTFLOAT64U pr64Res, uint32_t fMxcsr, int32_t i32Val)
16099{
16100 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16101 float64_t r64Result = i32_to_f64(i32Val, &SoftState);
16102 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Result, pr64Res, fMxcsr);
16103}
16104
16105
16106IEM_DECL_IMPL_DEF(void, iemAImpl_cvtdq2pd_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16107{
16108 RT_NOREF(puSrc1);
16109
16110 pResult->MXCSR = iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[0], pFpuState->MXCSR, puSrc2->ai32[0]);
16111 pResult->MXCSR |= iemAImpl_cvtdq2pd_u128_worker(&pResult->uResult.ar64[1], pFpuState->MXCSR, puSrc2->ai32[1]);
16112}
16113#endif
16114
16115
16116/**
16117 * CVTPD2DQ
16118 */
16119#ifdef IEM_WITHOUT_ASSEMBLY
16120static uint32_t iemAImpl_cvtpd2dq_u128_worker(int32_t *pi32Res, uint32_t fMxcsr, PCRTFLOAT64U pr64Src)
16121{
16122 RTFLOAT64U r64Src;
16123 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* De-normal seems to be ignored. */
16124
16125 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
16126 *pi32Res = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
16127 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
16128}
16129
16130
16131IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2dq_u128,(PX86FXSTATE pFpuState, PIEMSSERESULT pResult, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
16132{
16133 RT_NOREF(puSrc1);
16134
16135 pResult->MXCSR = iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[0], pFpuState->MXCSR, &puSrc2->ar64[0]);
16136 pResult->MXCSR |= iemAImpl_cvtpd2dq_u128_worker(&pResult->uResult.ai32[1], pFpuState->MXCSR, &puSrc2->ar64[1]);
16137 pResult->uResult.au64[1] = 0;
16138}
16139#endif
16140
16141
16142/**
16143 * [V]SHUFPS
16144 */
16145#ifdef IEM_WITHOUT_ASSEMBLY
16146IEM_DECL_IMPL_DEF(void, iemAImpl_shufps_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16147{
16148 RTUINT128U const uSrc1 = *puDst;
16149 RTUINT128U const uSrc2 = *puSrc;
16150 ASMCompilerBarrier();
16151 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16152 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16153 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16154 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16155}
16156#endif
16157
16158
16159IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16160{
16161 RTUINT128U const uSrc1 = *puSrc1;
16162 RTUINT128U const uSrc2 = *puSrc2;
16163 ASMCompilerBarrier();
16164 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16165 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16166 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16167 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16168}
16169
16170
16171IEM_DECL_IMPL_DEF(void, iemAImpl_vshufps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16172{
16173 RTUINT256U const uSrc1 = *puSrc1;
16174 RTUINT256U const uSrc2 = *puSrc2;
16175 ASMCompilerBarrier();
16176 puDst->au32[0] = uSrc1.au32[bEvil & 0x3];
16177 puDst->au32[1] = uSrc1.au32[(bEvil >> 2) & 0x3];
16178 puDst->au32[2] = uSrc2.au32[(bEvil >> 4) & 0x3];
16179 puDst->au32[3] = uSrc2.au32[(bEvil >> 6) & 0x3];
16180
16181 puDst->au32[4] = uSrc1.au32[4 + (bEvil & 0x3)];
16182 puDst->au32[5] = uSrc1.au32[4 + ((bEvil >> 2) & 0x3)];
16183 puDst->au32[6] = uSrc2.au32[4 + ((bEvil >> 4) & 0x3)];
16184 puDst->au32[7] = uSrc2.au32[4 + ((bEvil >> 6) & 0x3)];
16185}
16186
16187
16188/**
16189 * [V]SHUFPD
16190 */
16191#ifdef IEM_WITHOUT_ASSEMBLY
16192IEM_DECL_IMPL_DEF(void, iemAImpl_shufpd_u128,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16193{
16194 RTUINT128U const uSrc1 = *puDst;
16195 RTUINT128U const uSrc2 = *puSrc;
16196 ASMCompilerBarrier();
16197 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16198 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16199}
16200#endif
16201
16202
16203IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16204{
16205 RTUINT128U const uSrc1 = *puSrc1;
16206 RTUINT128U const uSrc2 = *puSrc2;
16207 ASMCompilerBarrier();
16208 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16209 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16210}
16211
16212
16213IEM_DECL_IMPL_DEF(void, iemAImpl_vshufpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16214{
16215 RTUINT256U const uSrc1 = *puSrc1;
16216 RTUINT256U const uSrc2 = *puSrc2;
16217 ASMCompilerBarrier();
16218 puDst->au64[0] = (bEvil & RT_BIT(0)) ? uSrc1.au64[1] : uSrc1.au64[0];
16219 puDst->au64[1] = (bEvil & RT_BIT(1)) ? uSrc2.au64[1] : uSrc2.au64[0];
16220 puDst->au64[2] = (bEvil & RT_BIT(2)) ? uSrc1.au64[3] : uSrc1.au64[2];
16221 puDst->au64[3] = (bEvil & RT_BIT(3)) ? uSrc2.au64[3] : uSrc2.au64[2];
16222}
16223
16224
16225/*
16226 * PHMINPOSUW / VPHMINPOSUW
16227 */
16228IEM_DECL_IMPL_DEF(void, iemAImpl_phminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16229{
16230 uint16_t u16Min = puSrc->au16[0];
16231 uint8_t idxMin = 0;
16232
16233 for (uint8_t i = 1; i < RT_ELEMENTS(puSrc->au16); i++)
16234 if (puSrc->au16[i] < u16Min)
16235 {
16236 u16Min = puSrc->au16[i];
16237 idxMin = i;
16238 }
16239
16240 puDst->au64[0] = 0;
16241 puDst->au64[1] = 0;
16242 puDst->au16[0] = u16Min;
16243 puDst->au16[1] = idxMin;
16244}
16245
16246
/**
 * VPHMINPOSUW fallback.
 *
 * Simply forwards to iemAImpl_phminposuw_u128_fallback() with the same
 * arguments.
 *
 * @param   puDst   Where to store the result.
 * @param   puSrc   The source operand.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_vphminposuw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    iemAImpl_phminposuw_u128_fallback(puDst, puSrc);
}
16251
16252
16253/*
16254 * [V]PBLENDVB
16255 */
16256IEM_DECL_IMPL_DEF(void, iemAImpl_pblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16257{
16258 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16259 if (puMask->au8[i] & RT_BIT(7))
16260 puDst->au8[i] = puSrc->au8[i];
16261}
16262
16263
16264IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16265{
16266 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16267 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16268}
16269
16270
16271IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendvb_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16272{
16273 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
16274 puDst->au8[i] = puMask->au8[i] & RT_BIT(7) ? puSrc2->au8[i] : puSrc1->au8[i];
16275}
16276
16277
16278/*
16279 * [V]BLENDVPS
16280 */
16281IEM_DECL_IMPL_DEF(void, iemAImpl_blendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16282{
16283 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16284 if (puMask->au32[i] & RT_BIT_32(31))
16285 puDst->au32[i] = puSrc->au32[i];
16286}
16287
16288
16289IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16290{
16291 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16292 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16293}
16294
16295
16296IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16297{
16298 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16299 puDst->au32[i] = (puMask->au32[i] & RT_BIT_32(31)) ? puSrc2->au32[i] : puSrc1->au32[i];
16300}
16301
16302
16303/*
16304 * [V]BLENDVPD
16305 */
16306IEM_DECL_IMPL_DEF(void, iemAImpl_blendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puMask))
16307{
16308 if (puMask->au64[0] & RT_BIT_64(63)) puDst->au64[0] = puSrc->au64[0];
16309 if (puMask->au64[1] & RT_BIT_64(63)) puDst->au64[1] = puSrc->au64[1];
16310}
16311
16312
16313IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, PCRTUINT128U puMask))
16314{
16315 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16316 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16317}
16318
16319
16320IEM_DECL_IMPL_DEF(void, iemAImpl_vblendvpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, PCRTUINT256U puMask))
16321{
16322 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16323 puDst->au64[i] = (puMask->au64[i] & RT_BIT_64(63)) ? puSrc2->au64[i] : puSrc1->au64[i];
16324}
16325
16326
16327/**
16328 * [V]PALIGNR
16329 */
16330IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u64_fallback,(uint64_t *pu64Dst, uint64_t u64Src2, uint8_t bEvil))
16331{
16332 uint64_t const u64Src1 = *pu64Dst;
16333 ASMCompilerBarrier();
16334
16335 if (bEvil >= 16)
16336 *pu64Dst = 0;
16337 else if (bEvil >= 8)
16338 *pu64Dst = u64Src1 >> ((bEvil - 8) * 8);
16339 else
16340 {
16341 uint8_t cShift = bEvil * 8;
16342 *pu64Dst = ((u64Src1 & (RT_BIT_64(cShift) - 1)) << ((8 - bEvil) * 8))
16343 | (u64Src2 >> cShift);
16344 }
16345}
16346
16347
16348IEM_DECL_IMPL_DEF(void, iemAImpl_palignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16349{
16350 RTUINT128U const uSrc1 = *puDst;
16351 RTUINT128U const uSrc2 = *puSrc;
16352 ASMCompilerBarrier();
16353
16354 puDst->au64[0] = 0;
16355 puDst->au64[1] = 0;
16356 if (bEvil >= 32)
16357 { /* Everything stays 0. */ }
16358 else if (bEvil >= 16)
16359 {
16360 bEvil -= 16;
16361 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16362 puDst->au8[i - bEvil] = uSrc1.au8[i];
16363 }
16364 else
16365 {
16366 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16367 puDst->au8[i] = uSrc2.au8[i + bEvil];
16368 for (uint8_t i = 0; i < bEvil; i++)
16369 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16370 }
16371}
16372
16373
16374IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16375{
16376 RTUINT128U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16377 RTUINT128U const uSrc2 = *puSrc2;
16378 ASMCompilerBarrier();
16379
16380 puDst->au64[0] = 0;
16381 puDst->au64[1] = 0;
16382 if (bEvil >= 32)
16383 { /* Everything stays 0. */ }
16384 else if (bEvil >= 16)
16385 {
16386 bEvil -= 16;
16387 for (uint8_t i = bEvil; i < RT_ELEMENTS(puDst->au8); i++)
16388 puDst->au8[i - bEvil] = uSrc1.au8[i];
16389 }
16390 else
16391 {
16392 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8) - bEvil; i++)
16393 puDst->au8[i] = uSrc2.au8[i + bEvil];
16394 for (uint8_t i = 0; i < bEvil; i++)
16395 puDst->au8[i + RT_ELEMENTS(puDst->au8) - bEvil] = uSrc1.au8[i];
16396 }
16397}
16398
16399
16400IEM_DECL_IMPL_DEF(void, iemAImpl_vpalignr_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16401{
16402 RTUINT256U const uSrc1 = *puSrc1; /* Might overlap with destination. */
16403 RTUINT256U const uSrc2 = *puSrc2;
16404 ASMCompilerBarrier();
16405
16406 iemAImpl_vpalignr_u128_fallback(&puDst->au128[0], &uSrc1.au128[0], &uSrc2.au128[0], bEvil);
16407 iemAImpl_vpalignr_u128_fallback(&puDst->au128[1], &uSrc1.au128[1], &uSrc2.au128[1], bEvil);
16408}
16409
16410
16411/**
16412 * [V]PBLENDW
16413 */
16414IEM_DECL_IMPL_DEF(void, iemAImpl_pblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16415{
16416 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16417 if (bEvil & RT_BIT(i))
16418 puDst->au16[i] = puSrc->au16[i];
16419}
16420
16421
16422IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16423{
16424 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
16425 if (bEvil & RT_BIT(i))
16426 puDst->au16[i] = puSrc2->au16[i];
16427 else
16428 puDst->au16[i] = puSrc1->au16[i];
16429}
16430
16431
16432IEM_DECL_IMPL_DEF(void, iemAImpl_vpblendw_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16433{
16434 for (uint8_t i = 0; i < 8; i++)
16435 if (bEvil & RT_BIT(i))
16436 {
16437 puDst->au16[ i] = puSrc2->au16[ i];
16438 puDst->au16[8 + i] = puSrc2->au16[8 + i];
16439 }
16440 else
16441 {
16442 puDst->au16[ i] = puSrc1->au16[ i];
16443 puDst->au16[8 + i] = puSrc1->au16[8 + i];
16444 }
16445}
16446
16447
16448/**
16449 * [V]BLENDPS
16450 */
16451IEM_DECL_IMPL_DEF(void, iemAImpl_blendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16452{
16453 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16454 if (bEvil & RT_BIT(i))
16455 puDst->au32[i] = puSrc->au32[i];
16456}
16457
16458
16459IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16460{
16461 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16462 if (bEvil & RT_BIT(i))
16463 puDst->au32[i] = puSrc2->au32[i];
16464 else
16465 puDst->au32[i] = puSrc1->au32[i];
16466}
16467
16468
16469IEM_DECL_IMPL_DEF(void, iemAImpl_vblendps_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16470{
16471 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au32); i++)
16472 if (bEvil & RT_BIT(i))
16473 puDst->au32[i] = puSrc2->au32[i];
16474 else
16475 puDst->au32[i] = puSrc1->au32[i];
16476}
16477
16478
16479/**
16480 * [V]BLENDPD
16481 */
16482IEM_DECL_IMPL_DEF(void, iemAImpl_blendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
16483{
16484 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16485 if (bEvil & RT_BIT(i))
16486 puDst->au64[i] = puSrc->au64[i];
16487}
16488
16489
16490IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
16491{
16492 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16493 if (bEvil & RT_BIT(i))
16494 puDst->au64[i] = puSrc2->au64[i];
16495 else
16496 puDst->au64[i] = puSrc1->au64[i];
16497}
16498
16499
16500IEM_DECL_IMPL_DEF(void, iemAImpl_vblendpd_u256_fallback,(PRTUINT256U puDst, PCRTUINT256U puSrc1, PCRTUINT256U puSrc2, uint8_t bEvil))
16501{
16502 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au64); i++)
16503 if (bEvil & RT_BIT(i))
16504 puDst->au64[i] = puSrc2->au64[i];
16505 else
16506 puDst->au64[i] = puSrc1->au64[i];
16507}
16508
16509
16510/**
16511 * AES tables and helper routines. Tables from Intel AES-NI whitepaper.
16512 */
16513
/* The S-Box lookup table (byte substitution for the encryption direction). */
static uint8_t iemAImpl_aes_sbox[256] = {
    /* 0x00 */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x10 */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x20 */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x30 */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x40 */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x50 */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x60 */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x70 */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x80 */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x90 */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xa0 */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xb0 */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xc0 */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xd0 */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xe0 */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xf0 */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
16532
/* The InvS-Box lookup table (byte substitution for the decryption direction). */
static uint8_t iemAImpl_aes_inv_sbox[256] = {
    /* 0x00 */ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
    /* 0x10 */ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
    /* 0x20 */ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
    /* 0x30 */ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
    /* 0x40 */ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
    /* 0x50 */ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
    /* 0x60 */ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
    /* 0x70 */ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
    /* 0x80 */ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
    /* 0x90 */ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
    /* 0xa0 */ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
    /* 0xb0 */ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
    /* 0xc0 */ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
    /* 0xd0 */ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
    /* 0xe0 */ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
    /* 0xf0 */ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
};
16552
/* The ShiftRows lookup table: entry N is the state byte index that ends up at position N. */
static uint8_t iemAImpl_aes_shift_rows_tbl[16] = {
     0,  5, 10, 15,
     4,  9, 14,  3,
     8, 13,  2,  7,
    12,  1,  6, 11
};
16557
/* The InvShiftRows lookup table: entry N is the state byte index that ends up at position N. */
static uint8_t iemAImpl_aes_inv_shift_rows_tbl[16] = {
     0, 13, 10,  7,
     4,  1, 14, 11,
     8,  5,  2, 15,
    12,  9,  6,  3
};
16562
16563static inline RTUINT128U iemAImpl_aes_sub_bytes(PCRTUINT128U puSrc, uint8_t abSubst[256])
16564{
16565 RTUINT128U uVal;
16566 int i;
16567
16568 for (i = 0; i < 16; ++i)
16569 uVal.au8[i] = abSubst[puSrc->au8[i]];
16570
16571 return uVal;
16572}
16573
/**
 * Doubles a byte: shift left by one and, if bit 7 was set, XOR in 0x1b (27).
 * The result is truncated to 8 bits.
 *
 * @returns The doubled byte.
 * @param   u   The byte to double.
 */
static inline uint8_t iemAImpl_aes_xtime(uint8_t u)
{
    unsigned uDoubled = (unsigned)u << 1;
    if (u & 0x80)
        uDoubled ^= 0x1b;
    return (uint8_t)uDoubled;
}
16578
16579static RTUINT128U iemAImpl_aes_mix_col(PCRTUINT128U puSrc)
16580{
16581 RTUINT128U uVal;
16582 int i;
16583 uint8_t tmp;
16584
16585 for (i = 0; i < 16; i += 4) {
16586 tmp = puSrc->au8[i+0] ^ puSrc->au8[i+1] ^ puSrc->au8[i+2] ^ puSrc->au8[i+3];
16587 uVal.au8[i+0] = puSrc->au8[i+0] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+0] ^ puSrc->au8[i+1]);
16588 uVal.au8[i+1] = puSrc->au8[i+1] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+1] ^ puSrc->au8[i+2]);
16589 uVal.au8[i+2] = puSrc->au8[i+2] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+2] ^ puSrc->au8[i+3]);
16590 uVal.au8[i+3] = puSrc->au8[i+3] ^ tmp ^ iemAImpl_aes_xtime(puSrc->au8[i+3] ^ puSrc->au8[i+0]);
16591 }
16592
16593 return uVal;
16594}
16595
16596static inline RTUINT128U iemAImpl_aes_shift_rows(PCRTUINT128U puSrc, uint8_t abShift[16])
16597{
16598 RTUINT128U uVal;
16599 int i;
16600
16601 for (i = 0; i < 16; ++i)
16602 uVal.au8[i] = puSrc->au8[abShift[i]];
16603
16604 return uVal;
16605}
16606
16607static uint8_t iemAImpl_aes_clmul(uint8_t a, uint8_t b)
16608{
16609 uint8_t val;
16610
16611 val = ((b >> 0) & 1) * a;
16612 val ^= ((b >> 1) & 1) * iemAImpl_aes_xtime(a);
16613 val ^= ((b >> 2) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(a));
16614 val ^= ((b >> 3) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a)));
16615 val ^= ((b >> 4) & 1) * iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(iemAImpl_aes_xtime(a))));
16616
16617 return val;
16618}
16619
16620static RTUINT128U iemAImpl_aes_inv_mix_col(PCRTUINT128U puSrc)
16621{
16622 RTUINT128U uVal;
16623 int i;
16624
16625 for (i = 0; i < 16; i += 4) {
16626 uVal.au8[i+0] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0b)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x09);
16627 uVal.au8[i+1] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0e)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0d);
16628 uVal.au8[i+2] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0d) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x09)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x0e) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0b);
16629 uVal.au8[i+3] = iemAImpl_aes_clmul(puSrc->au8[i+0], 0x0b) ^ iemAImpl_aes_clmul(puSrc->au8[i+1], 0x0d)^ iemAImpl_aes_clmul(puSrc->au8[i+2], 0x09) ^ iemAImpl_aes_clmul(puSrc->au8[i+3], 0x0e);
16630 }
16631
16632 return uVal;
16633}
16634
16635static inline uint32_t iemAImpl_aes_sub_word(uint32_t w)
16636{
16637 RTUINT32U uTmp;
16638
16639 uTmp.au32[0] = w;
16640 uTmp.au8[0] = iemAImpl_aes_sbox[uTmp.au8[0]];
16641 uTmp.au8[1] = iemAImpl_aes_sbox[uTmp.au8[1]];
16642 uTmp.au8[2] = iemAImpl_aes_sbox[uTmp.au8[2]];
16643 uTmp.au8[3] = iemAImpl_aes_sbox[uTmp.au8[3]];
16644
16645 return uTmp.au32[0];
16646}
16647
/**
 * Rotates a 32-bit word right by 8 bits.
 *
 * @returns The rotated word.
 * @param   w   The input word.
 */
static inline uint32_t iemAImpl_aes_rot_word(uint32_t w)
{
    uint32_t const uUpper3 = w >> 8;    /* Bits 31:8 move down to 23:0. */
    uint32_t const uLowest = w << 24;   /* Bits 7:0 wrap around to 31:24. */
    return uLowest | uUpper3;
}
16652
16653/**
16654 * [V]AESKEYGENASSIST
16655 */
16656IEM_DECL_IMPL_DEF(void, iemAImpl_aeskeygenassist_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bImm))
16657{
16658 RTUINT128U uTmp;
16659 uint32_t uRCon = bImm; /* Round constant. */
16660
16661 uTmp.au32[0] = iemAImpl_aes_sub_word(puSrc->au32[1]); /* puSrc = KeyGen. */
16662 uTmp.au32[1] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[1])) ^ uRCon;
16663 uTmp.au32[2] = iemAImpl_aes_sub_word(puSrc->au32[3]);
16664 uTmp.au32[3] = iemAImpl_aes_rot_word(iemAImpl_aes_sub_word(puSrc->au32[3])) ^ uRCon;
16665
16666 *puDst = uTmp;
16667}
16668
16669
/**
 * [V]AESIMC
 *
 * Stores the result of iemAImpl_aes_inv_mix_col() on the source in the
 * destination.
 *
 * @param   puDst   Where to store the result.
 * @param   puSrc   The key to transform.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_aesimc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
{
    *puDst = iemAImpl_aes_inv_mix_col(puSrc); /* Src = Key. */
}
16677
16678
16679/**
16680 * [V]AESENC
16681 */
16682IEM_DECL_IMPL_DEF(void, iemAImpl_aesenc_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16683{
16684 RTUINT128U uTmp;
16685
16686 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16687 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16688 uTmp = iemAImpl_aes_mix_col(&uTmp);
16689 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16690 uTmp.au64[1] ^= puSrc->au64[1];
16691
16692 *puDst = uTmp;
16693}
16694
16695
16696/**
16697 * [V]AESENCLAST
16698 */
16699IEM_DECL_IMPL_DEF(void, iemAImpl_aesenclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16700{
16701 RTUINT128U uTmp;
16702
16703 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_shift_rows_tbl); /* Dst = state. */
16704 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_sbox);
16705 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16706 uTmp.au64[1] ^= puSrc->au64[1];
16707
16708 *puDst = uTmp;
16709}
16710
16711
16712/**
16713 * [V]AESDEC
16714 */
16715IEM_DECL_IMPL_DEF(void, iemAImpl_aesdec_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16716{
16717 RTUINT128U uTmp;
16718
16719 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16720 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16721 uTmp = iemAImpl_aes_inv_mix_col(&uTmp);
16722 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16723 uTmp.au64[1] ^= puSrc->au64[1];
16724
16725 *puDst = uTmp;
16726}
16727
16728
16729/**
16730 * [V]AESDECLAST
16731 */
16732IEM_DECL_IMPL_DEF(void, iemAImpl_aesdeclast_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
16733{
16734 RTUINT128U uTmp;
16735
16736 uTmp = iemAImpl_aes_shift_rows(puDst, iemAImpl_aes_inv_shift_rows_tbl); /* Dst = state. */
16737 uTmp = iemAImpl_aes_sub_bytes(&uTmp, iemAImpl_aes_inv_sbox);
16738 uTmp.au64[0] ^= puSrc->au64[0]; /* Src = Round Key. */
16739 uTmp.au64[1] ^= puSrc->au64[1];
16740
16741 *puDst = uTmp;
16742}
16743
16744
16745/**
16746 * [V]PCMPISTRI
16747 */
16748
16749/**
16750 * Does the comparisons based on the mode and source input format.
16751 */
16752static void iemAImpl_pcmpxstrx_cmp(bool afCmpRes[16][16], PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bImm)
16753{
16754#define PCMPXSTRX_CMP_CASE(a_fCmpRes, a_puSrc1, a_puSrc2, a_SrcMember, a_bAggOp) \
16755 do \
16756 { \
16757 for (uint8_t idxSrc2 = 0; idxSrc2 < RT_ELEMENTS((a_puSrc2)->a_SrcMember); idxSrc2++) \
16758 for (uint8_t idxSrc1 = 0; idxSrc1 < RT_ELEMENTS((a_puSrc1)->a_SrcMember); idxSrc1 += 2) \
16759 { \
16760 switch (a_bAggOp) \
16761 { \
16762 case 0: \
16763 case 2: \
16764 case 3: \
16765 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16766 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] == (a_puSrc2)->a_SrcMember[idxSrc2]; \
16767 break; \
16768 case 1: \
16769 afCmpRes[idxSrc2][idxSrc1] = (a_puSrc1)->a_SrcMember[idxSrc1] <= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16770 afCmpRes[idxSrc2][idxSrc1 + 1] = (a_puSrc1)->a_SrcMember[idxSrc1 + 1] >= (a_puSrc2)->a_SrcMember[idxSrc2]; \
16771 break; \
16772 default: \
16773 AssertReleaseFailed(); \
16774 } \
16775 } \
16776 } while(0)
16777
16778 uint8_t bAggOp = (bImm >> 2) & 0x3;
16779 switch (bImm & 0x3)
16780 {
16781 case 0:
16782 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au8, bAggOp);
16783 break;
16784 case 1:
16785 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, au16, bAggOp);
16786 break;
16787 case 2:
16788 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai8, bAggOp);
16789 break;
16790 case 3:
16791 PCMPXSTRX_CMP_CASE(afCmpRes, puSrc1, puSrc2, ai16, bAggOp);
16792 break;
16793 default:
16794 AssertReleaseFailed();
16795 }
16796#undef PCMPXSTRX_CMP_CASE
16797}
16798
16799static uint8_t iemAImpl_pcmpistrx_get_str_len_implicit(PCRTUINT128U puSrc, uint8_t bImm)
16800{
16801 if (bImm & 0x1)
16802 {
16803 /* Words -> 8 elements. */
16804 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au16); i++)
16805 if (puSrc->au16[i] == 0)
16806 return i;
16807
16808 return 8;
16809 }
16810 else
16811 {
16812 /* Bytes -> 16 elements. */
16813 for (uint8_t i = 0; i < RT_ELEMENTS(puSrc->au8); i++)
16814 if (puSrc->au8[i] == 0)
16815 return i;
16816
16817 return 16;
16818 }
16819}
16820
/**
 * Converts an explicit length value to an element count: the absolute value
 * of the length, saturated at the element count of the operand.
 *
 * @returns Length in elements (0..8 for words, 0..16 for bytes).
 * @param   i64Len  The signed length value.
 * @param   bImm    Bit 0 set means word elements, clear means byte elements.
 */
static uint8_t iemAImpl_pcmpistrx_get_str_len_explicit(int64_t i64Len, uint8_t bImm)
{
    int64_t const cMax = (bImm & 0x1) ? 8 : 16;

    /* Anything at or beyond +/- the element count saturates. */
    if (i64Len <= -cMax || i64Len >= cMax)
        return (uint8_t)cMax;

    return (uint8_t)(i64Len < 0 ? -i64Len : i64Len);
}
16838
/**
 * Valid/Invalid override of comparisons (Table 4-7 from 4.1.6 of SDM).
 *
 * Rows are indexed by Imm8[3:2] (the aggregation operation).  Columns:
 *      - 0: xmm1 and xmm2/m128 elements both invalid,
 *      - 1: xmm1 element invalid, xmm2/m128 element valid,
 *      - 2: xmm1 element valid, xmm2/m128 element invalid.
 */
static const bool g_afCmpOverride[4][3] =
{
    /* 00b (equal any):     */ { false, false, false },
    /* 01b (ranges):        */ { false, false, false },
    /* 10b (equal each):    */ { true,  false, false },
    /* 11b (equal ordered): */ { true,  true,  false },
};
16850
16851DECL_FORCE_INLINE(bool) iemAImpl_pcmpxstrx_cmp_override_if_invalid(bool fCmpRes, bool fSrc1Valid, bool fSrc2Valid, uint8_t bAggOp)
16852{
16853 if (fSrc1Valid && fSrc2Valid)
16854 return fCmpRes;
16855
16856 uint8_t bSrc1Valid = fSrc1Valid ? 2 : 0;
16857 uint8_t bSrc2Valid = fSrc2Valid ? 1 : 0;
16858 return g_afCmpOverride[bAggOp][bSrc1Valid + bSrc2Valid];
16859}
16860
16861static uint16_t iemAImpl_pcmpxstrx_cmp_aggregate(bool afCmpRes[16][16], uint8_t idxLen1, uint8_t idxLen2, uint8_t cElems, uint8_t bImm)
16862{
16863 uint8_t bAggOp = (bImm >> 2) & 0x3;
16864 uint16_t u16Result = 0;
16865
16866 switch (bAggOp)
16867 {
16868 case 0: /* Equal any */
16869 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16870 {
16871 uint16_t u16Res = 0;
16872 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1++)
16873 {
16874 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16875 idxSrc1 < idxLen1,
16876 idxSrc2 < idxLen2,
16877 bAggOp))
16878 {
16879 u16Res = RT_BIT(idxSrc2);
16880 break;
16881 }
16882 }
16883
16884 u16Result |= u16Res;
16885 }
16886 break;
16887
16888 case 1: /* Ranges */
16889 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16890 {
16891 uint16_t u16Res = 0;
16892 for (uint8_t idxSrc1 = 0; idxSrc1 < cElems; idxSrc1 += 2)
16893 {
16894 if ( iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1],
16895 idxSrc1 < idxLen1,
16896 idxSrc2 < idxLen2,
16897 bAggOp)
16898 && iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[idxSrc2][idxSrc1 + 1],
16899 (idxSrc1 + 1) < idxLen1,
16900 idxSrc2 < idxLen2,
16901 bAggOp))
16902 {
16903 u16Res = RT_BIT(idxSrc2);
16904 break;
16905 }
16906 }
16907
16908 u16Result |= u16Res;
16909 }
16910 break;
16911
16912 case 2: /* Equal each */
16913 for (uint8_t i = 0; i < cElems; i++)
16914 {
16915 if (iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[i][i],
16916 i < idxLen1,
16917 i < idxLen2,
16918 bAggOp))
16919 u16Result |= RT_BIT(i);
16920 }
16921 break;
16922
16923 case 3: /* Equal ordered */
16924 u16Result = 0;
16925 for (uint8_t idxSrc2 = 0; idxSrc2 < cElems; idxSrc2++)
16926 {
16927 uint16_t u16Res = RT_BIT(idxSrc2);
16928 for (uint8_t idxSrc1 = 0, k = idxSrc2; (idxSrc1 < (cElems - idxSrc2)) && (k < cElems); idxSrc1++, k++)
16929 {
16930 if (!iemAImpl_pcmpxstrx_cmp_override_if_invalid(afCmpRes[k][idxSrc1],
16931 idxSrc1 < idxLen1,
16932 k < idxLen2,
16933 bAggOp))
16934 {
16935 u16Res = 0;
16936 break;
16937 }
16938 }
16939
16940 u16Result |= u16Res;
16941 }
16942 break;
16943 }
16944
16945 /* Polarity selection. */
16946 switch ((bImm >> 4) & 0x3)
16947 {
16948 case 0:
16949 case 2:
16950 /* Nothing to do. */
16951 break;
16952 case 1:
16953 u16Result = (cElems == 8 ? 0xff : 0xffff) ^ u16Result;
16954 break;
16955 case 3:
16956 u16Result ^= RT_BIT(idxLen2) - 1;
16957 break;
16958 default:
16959 AssertReleaseFailed();
16960 }
16961
16962 return u16Result;
16963}
16964
16965DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrx_set_eflags(uint32_t *pfEFlags, uint16_t u16Result, uint8_t cLen1, uint8_t cLen2, uint8_t cElems)
16966{
16967 uint32_t fEFlags = 0;
16968
16969 if (u16Result)
16970 fEFlags |= X86_EFL_CF;
16971 if (cLen2 < cElems)
16972 fEFlags |= X86_EFL_ZF;
16973 if (cLen1 < cElems)
16974 fEFlags |= X86_EFL_SF;
16975 if (u16Result & 0x1)
16976 fEFlags |= X86_EFL_OF;
16977 *pfEFlags = (*pfEFlags & ~X86_EFL_STATUS_BITS) | fEFlags;
16978}
16979
16980DECL_FORCE_INLINE(uint16_t) iemAImpl_pcmpxstrx_worker(uint32_t *pEFlags, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2,
16981 uint8_t cLen1, uint8_t cLen2, uint8_t bEvil)
16982{
16983 bool afCmpRes[16][16];
16984 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
16985
16986 iemAImpl_pcmpxstrx_cmp(afCmpRes, puSrc1, puSrc2, bEvil);
16987 uint16_t u16Result = iemAImpl_pcmpxstrx_cmp_aggregate(afCmpRes, cLen1, cLen2, cElems, bEvil);
16988 iemAImpl_pcmpxstrx_set_eflags(pEFlags, u16Result, cLen1, cLen2, cElems);
16989
16990 return u16Result;
16991}
16992
16993DECL_FORCE_INLINE(void) iemAImpl_pcmpxstri_set_result_index(uint32_t *pu32Ecx, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
16994{
16995 if (bImm & RT_BIT(6))
16996 {
16997 /* Index for MSB set. */
16998 uint32_t idxMsb = ASMBitLastSetU16(u16Result);
16999 if (idxMsb)
17000 *pu32Ecx = idxMsb - 1;
17001 else
17002 *pu32Ecx = cElems;
17003 }
17004 else
17005 {
17006 /* Index for LSB set. */
17007 uint32_t idxLsb = ASMBitFirstSetU16(u16Result);
17008 if (idxLsb)
17009 *pu32Ecx = idxLsb - 1;
17010 else
17011 *pu32Ecx = cElems;
17012 }
17013}
17014
17015IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17016{
17017 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17018 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17019 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17020
17021 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17022 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17023}
17024
17025
17026/**
17027 * [V]PCMPESTRI
17028 */
17029IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestri_u128_fallback,(uint32_t *pu32Ecx, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17030{
17031 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17032 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17033 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17034
17035 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17036 iemAImpl_pcmpxstri_set_result_index(pu32Ecx, u16Result, cElems, bEvil);
17037}
17038
17039
17040/**
17041 * [V]PCMPISTRM
17042 */
17043DECL_FORCE_INLINE(void) iemAImpl_pcmpxstrm_set_result_mask(PRTUINT128U puDst, uint16_t u16Result, uint8_t cElems, uint8_t bImm)
17044{
17045 if (bImm & RT_BIT(6))
17046 {
17047 /* Generate a mask. */
17048 if (cElems == 8)
17049 {
17050 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
17051 if (u16Result & RT_BIT(i))
17052 puDst->au16[i] = 0xffff;
17053 else
17054 puDst->au16[i] = 0;
17055 }
17056 else
17057 {
17058 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au8); i++)
17059 if (u16Result & RT_BIT(i))
17060 puDst->au8[i] = 0xff;
17061 else
17062 puDst->au8[i] = 0;
17063 }
17064 }
17065 else
17066 {
17067 /* Store the result. */
17068 puDst->au64[0] = u16Result;
17069 puDst->au64[1] = 0;
17070 }
17071}
17072
17073IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpistrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPISTRXSRC pSrc, uint8_t bEvil))
17074{
17075 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17076 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc1, bEvil);
17077 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_implicit(&pSrc->uSrc2, bEvil);
17078
17079 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17080 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17081}
17082
17083
17084/**
17085 * [V]PCMPESTRM
17086 */
17087IEM_DECL_IMPL_DEF(void, iemAImpl_pcmpestrm_u128_fallback,(PRTUINT128U puDst, uint32_t *pEFlags, PCIEMPCMPESTRXSRC pSrc, uint8_t bEvil))
17088{
17089 uint8_t cElems = (bEvil & RT_BIT(0)) ? 8 : 16;
17090 uint8_t cLen1 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rax, bEvil);
17091 uint8_t cLen2 = iemAImpl_pcmpistrx_get_str_len_explicit((int64_t)pSrc->u64Rdx, bEvil);
17092
17093 uint16_t u16Result = iemAImpl_pcmpxstrx_worker(pEFlags, &pSrc->uSrc1, &pSrc->uSrc2, cLen1, cLen2, bEvil);
17094 iemAImpl_pcmpxstrm_set_result_mask(puDst, u16Result, cElems, bEvil);
17095}
17096
17097
17098/*
17099 * [V]PCLMULQDQ
17100 */
/**
 * PCLMULQDQ fallback.
 *
 * Forwards to iemAImpl_vpclmulqdq_u128_fallback() with the destination also
 * serving as the first source operand.
 *
 * @param   puDst   First source on input, result on output.
 * @param   puSrc   Second source operand.
 * @param   bEvil   The immediate, passed through unchanged.
 */
IEM_DECL_IMPL_DEF(void, iemAImpl_pclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
{
    iemAImpl_vpclmulqdq_u128_fallback(puDst, puDst, puSrc, bEvil);
}
17105
17106
17107IEM_DECL_IMPL_DEF(void, iemAImpl_vpclmulqdq_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc1, PCRTUINT128U puSrc2, uint8_t bEvil))
17108{
17109 uint64_t uSrc1 = puSrc1->au64[bEvil & 0x1];
17110 uint64_t uSrc2 = puSrc2->au64[(bEvil >> 4) & 0x1];
17111
17112 puDst->au64[0] = 0;
17113 puDst->au64[1] = 0;
17114
17115 /*
17116 * See https://en.wikipedia.org/wiki/Carry-less_product#Example (as of 2022-09-08) for the algorithm.
17117 * Do the first round outside the loop to avoid ASAN complaining about shift exponent being too large (64)
17118 * and squeeze out some optimizations.
17119 */
17120 if (uSrc1 & 0x1)
17121 puDst->au64[0] = uSrc2;
17122
17123 uSrc1 >>= 1;
17124
17125 uint8_t iDigit = 1;
17126 while (uSrc1)
17127 {
17128 if (uSrc1 & 0x1)
17129 {
17130 puDst->au64[0] ^= (uSrc2 << iDigit);
17131 puDst->au64[1] ^= uSrc2 >> (64 - iDigit);
17132 }
17133
17134 uSrc1 >>= 1;
17135 iDigit++;
17136 }
17137}
17138
17139
17140/**
17141 * [V]PINSRW
17142 */
17143#ifdef IEM_WITHOUT_ASSEMBLY
17144IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u64,(uint64_t *pu64Dst, uint16_t u16Src, uint8_t bEvil))
17145{
17146 uint8_t cShift = (bEvil & 0x3) * 16;
17147 *pu64Dst = (*pu64Dst & ~(UINT64_C(0xffff) << cShift)) | ((uint64_t)u16Src << cShift);
17148}
17149
17150
17151IEM_DECL_IMPL_DEF(void, iemAImpl_pinsrw_u128,(PRTUINT128U puDst, uint16_t u16Src, uint8_t bEvil))
17152{
17153 puDst->au16[bEvil & 0x7] = u16Src;
17154}
17155#endif
17156
17157
17158IEM_DECL_IMPL_DEF(void, iemAImpl_vpinsrw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint16_t u16Src, uint8_t bEvil))
17159{
17160 *puDst = *puSrc;
17161 puDst->au16[bEvil & 0x7] = u16Src;
17162}
17163
17164
17165/**
17166 * [V]PEXTRW
17167 */
17168#ifdef IEM_WITHOUT_ASSEMBLY
17169IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u64,(uint16_t *pu16Dst, uint64_t u64Src, uint8_t bEvil))
17170{
17171 *pu16Dst = (uint16_t)(u64Src >> ((bEvil & 0x3) * 16));
17172}
17173
17174
17175IEM_DECL_IMPL_DEF(void, iemAImpl_pextrw_u128,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17176{
17177 *pu16Dst = puSrc->au16[bEvil & 0x7];
17178}
17179
17180#endif
17181
17182IEM_DECL_IMPL_DEF(void, iemAImpl_vpextrw_u128_fallback,(uint16_t *pu16Dst, PCRTUINT128U puSrc, uint8_t bEvil))
17183{
17184 *pu16Dst = puSrc->au16[bEvil & 0x7];
17185}
17186
17187
17188/**
17189 * [V]MOVMSKPS
17190 */
17191#ifdef IEM_WITHOUT_ASSEMBLY
17192IEM_DECL_IMPL_DEF(void, iemAImpl_movmskps_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17193{
17194 *pu8Dst = puSrc->au32[0] >> 31;
17195 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17196 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17197 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17198}
17199
17200#endif
17201
17202IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17203{
17204 *pu8Dst = puSrc->au32[0] >> 31;
17205 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17206 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17207 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17208}
17209
17210
17211IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskps_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17212{
17213 *pu8Dst = puSrc->au32[0] >> 31;
17214 *pu8Dst |= (puSrc->au32[1] >> 31) << 1;
17215 *pu8Dst |= (puSrc->au32[2] >> 31) << 2;
17216 *pu8Dst |= (puSrc->au32[3] >> 31) << 3;
17217 *pu8Dst |= (puSrc->au32[4] >> 31) << 4;
17218 *pu8Dst |= (puSrc->au32[5] >> 31) << 5;
17219 *pu8Dst |= (puSrc->au32[6] >> 31) << 6;
17220 *pu8Dst |= (puSrc->au32[7] >> 31) << 7;
17221}
17222
17223
17224/**
17225 * [V]MOVMSKPD
17226 */
17227#ifdef IEM_WITHOUT_ASSEMBLY
17228IEM_DECL_IMPL_DEF(void, iemAImpl_movmskpd_u128,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17229{
17230 *pu8Dst = puSrc->au64[0] >> 63;
17231 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17232}
17233
17234#endif
17235
17236IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u128_fallback,(uint8_t *pu8Dst, PCRTUINT128U puSrc))
17237{
17238 *pu8Dst = puSrc->au64[0] >> 63;
17239 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17240}
17241
17242
17243IEM_DECL_IMPL_DEF(void, iemAImpl_vmovmskpd_u256_fallback,(uint8_t *pu8Dst, PCRTUINT256U puSrc))
17244{
17245 *pu8Dst = puSrc->au64[0] >> 63;
17246 *pu8Dst |= (puSrc->au64[1] >> 63) << 1;
17247 *pu8Dst |= (puSrc->au64[2] >> 63) << 2;
17248 *pu8Dst |= (puSrc->au64[3] >> 63) << 3;
17249}
17250
17251
17252/**
17253 * CVTTSD2SI
17254 */
17255#ifdef IEM_WITHOUT_ASSEMBLY
17256IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17257{
17258 RTFLOAT64U r64Src;
17259
17260 r64Src.u = *pu64Src;
17261 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17262
17263 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17264 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17265 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17266}
17267
17268
17269IEM_DECL_IMPL_DEF(void, iemAImpl_cvttsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17270{
17271 RTFLOAT64U r64Src;
17272
17273 r64Src.u = *pu64Src;
17274 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17275
17276 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17277 *pi64Dst = f64_to_i64_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17278 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17279}
17280#endif
17281
17282
17283/**
17284 * CVTSD2SI
17285 */
17286#ifdef IEM_WITHOUT_ASSEMBLY
17287IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i32_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint64_t *pu64Src))
17288{
17289 RTFLOAT64U r64Src;
17290
17291 r64Src.u = *pu64Src;
17292 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17293
17294 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17295 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17296 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17297}
17298
17299
17300IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsd2si_i64_r64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint64_t *pu64Src))
17301{
17302 RTFLOAT64U r64Src;
17303
17304 r64Src.u = *pu64Src;
17305 iemSsePrepareValueR64(&r64Src, pFpuState->MXCSR, &r64Src); /* The de-normal flag is not set. */
17306
17307 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17308 *pi64Dst = f64_to_i64(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17309 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17310}
17311#endif
17312
17313
17314/**
17315 * CVTTSS2SI
17316 */
17317#ifdef IEM_WITHOUT_ASSEMBLY
17318IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17319{
17320 RTFLOAT32U r32Src;
17321
17322 r32Src.u = *pu32Src;
17323 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17324
17325 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17326 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17327 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17328}
17329
17330
17331IEM_DECL_IMPL_DEF(void, iemAImpl_cvttss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17332{
17333 RTFLOAT32U r32Src;
17334
17335 r32Src.u = *pu32Src;
17336 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17337
17338 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17339 *pi64Dst = f32_to_i64_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17340 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17341}
17342#endif
17343
17344
17345/**
17346 * CVTSS2SI
17347 */
17348#ifdef IEM_WITHOUT_ASSEMBLY
17349IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i32_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int32_t *pi32Dst, const uint32_t *pu32Src))
17350{
17351 RTFLOAT32U r32Src;
17352
17353 r32Src.u = *pu32Src;
17354 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17355
17356 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17357 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17358 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17359}
17360
17361
17362IEM_DECL_IMPL_DEF(void, iemAImpl_cvtss2si_i64_r32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, int64_t *pi64Dst, const uint32_t *pu32Src))
17363{
17364 RTFLOAT32U r32Src;
17365
17366 r32Src.u = *pu32Src;
17367 iemSsePrepareValueR32(&r32Src, pFpuState->MXCSR, &r32Src); /* The de-normal flag is not set. */
17368
17369 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17370 *pi64Dst = f32_to_i64(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17371 *pfMxcsr = pFpuState->MXCSR | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17372}
17373#endif
17374
17375
17376/**
17377 * CVTSI2SD
17378 */
17379#ifdef IEM_WITHOUT_ASSEMBLY
17380IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int32_t *pi32Src))
17381{
17382 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17383 float64_t r64Res = i32_to_f64(*pi32Src, &SoftState);
17384 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17385}
17386
17387
17388IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2sd_r64_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT64U pr64Dst, const int64_t *pi64Src))
17389{
17390 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17391 float64_t r64Res = i64_to_f64(*pi64Src, &SoftState);
17392 *pfMxcsr = iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, pFpuState->MXCSR);
17393}
17394#endif
17395
17396
17397/**
17398 * CVTSI2SS
17399 */
17400#ifdef IEM_WITHOUT_ASSEMBLY
17401IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i32,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int32_t *pi32Src))
17402{
17403 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17404 float32_t r32Res = i32_to_f32(*pi32Src, &SoftState);
17405 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17406}
17407
17408
17409IEM_DECL_IMPL_DEF(void, iemAImpl_cvtsi2ss_r32_i64,(PCX86FXSTATE pFpuState, uint32_t *pfMxcsr, PRTFLOAT32U pr32Dst, const int64_t *pi64Src))
17410{
17411 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(pFpuState->MXCSR);
17412 float32_t r32Res = i64_to_f32(*pi64Src, &SoftState);
17413 *pfMxcsr = iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, pFpuState->MXCSR);
17414}
17415#endif
17416
17417
17418/**
17419 * [V]UCOMISS
17420 */
17421#ifdef IEM_WITHOUT_ASSEMBLY
17422IEM_DECL_IMPL_DEF(void, iemAImpl_ucomiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17423{
17424 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17425
17426 if (RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0]))
17427 {
17428 *pfMxcsr |= X86_MXCSR_IE;
17429 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17430 }
17431 else if (RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17432 {
17433 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17434 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17435 }
17436 else
17437 {
17438 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17439
17440 RTFLOAT32U r32Src1, r32Src2;
17441 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17442 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17443
17444 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17445 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17446 if (f32_eq(f32Src1, f32Src2, &SoftState))
17447 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17448 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17449 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17450 /* else: GREATER_THAN 000 */
17451
17452 *pfMxcsr |= fDe;
17453 }
17454
17455 *pfEFlags = fEFlagsNew;
17456}
17457#endif
17458
17459IEM_DECL_IMPL_DEF(void, iemAImpl_vucomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17460{
17461 iemAImpl_ucomiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17462}
17463
17464
17465/**
17466 * [V]UCOMISD
17467 */
17468#ifdef IEM_WITHOUT_ASSEMBLY
17469IEM_DECL_IMPL_DEF(void, iemAImpl_ucomisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17470{
17471 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17472
17473 if (RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0]))
17474 {
17475 *pfMxcsr |= X86_MXCSR_IE;
17476 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17477 }
17478 else if (RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17479 {
17480 /* ucomiss doesn't raise \#IE for quiet NaNs. */
17481 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17482 }
17483 else
17484 {
17485 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17486
17487 RTFLOAT64U r64Src1, r64Src2;
17488 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17489 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17490
17491 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17492 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17493 if (f64_eq(f64Src1, f64Src2, &SoftState))
17494 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17495 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17496 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17497 /* else: GREATER_THAN 000 */
17498
17499 *pfMxcsr |= fDe;
17500 }
17501
17502 *pfEFlags = fEFlagsNew;
17503}
17504#endif
17505
17506IEM_DECL_IMPL_DEF(void, iemAImpl_vucomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17507{
17508 iemAImpl_ucomisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17509}
17510
17511
17512/**
17513 * [V]COMISS
17514 */
17515#ifdef IEM_WITHOUT_ASSEMBLY
17516IEM_DECL_IMPL_DEF(void, iemAImpl_comiss_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17517{
17518 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17519
17520 if ( RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_SIGNALLING_NAN(&puSrc2->ar32[0])
17521 || RTFLOAT32U_IS_QUIET_NAN(&puSrc1->ar32[0]) || RTFLOAT32U_IS_QUIET_NAN(&puSrc2->ar32[0]))
17522 {
17523 *pfMxcsr |= X86_MXCSR_IE;
17524 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17525 }
17526 else
17527 {
17528 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17529
17530 RTFLOAT32U r32Src1, r32Src2;
17531 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, &puSrc1->ar32[0]);
17532 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, &puSrc2->ar32[0]);
17533
17534 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17535 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17536 if (f32_eq(f32Src1, f32Src2, &SoftState))
17537 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17538 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17539 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17540 /* else: GREATER_THAN 000 */
17541
17542 *pfMxcsr |= fDe;
17543 }
17544
17545 *pfEFlags = fEFlagsNew;
17546}
17547#endif
17548
17549
17550IEM_DECL_IMPL_DEF(void, iemAImpl_vcomiss_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17551{
17552 iemAImpl_comiss_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17553}
17554
17555
17556/**
17557 * [V]COMISD
17558 */
17559#ifdef IEM_WITHOUT_ASSEMBLY
17560IEM_DECL_IMPL_DEF(void, iemAImpl_comisd_u128,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17561{
17562 uint32_t fEFlagsNew = *pfEFlags & ~X86_EFL_STATUS_BITS;
17563
17564 if ( RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_SIGNALLING_NAN(&puSrc2->ar64[0])
17565 || RTFLOAT64U_IS_QUIET_NAN(&puSrc1->ar64[0]) || RTFLOAT64U_IS_QUIET_NAN(&puSrc2->ar64[0]))
17566 {
17567 *pfMxcsr |= X86_MXCSR_IE;
17568 fEFlagsNew |= X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF; /* UNORDERED 111 */
17569 }
17570 else
17571 {
17572 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17573
17574 RTFLOAT64U r64Src1, r64Src2;
17575 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, &puSrc1->ar64[0]);
17576 fDe |= iemSsePrepareValueR64(&r64Src2, *pfMxcsr, &puSrc2->ar64[0]);
17577
17578 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17579 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17580 if (f64_eq(f64Src1, f64Src2, &SoftState))
17581 fEFlagsNew |= X86_EFL_ZF; /* EQUAL 100 */
17582 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17583 fEFlagsNew |= X86_EFL_CF; /* LESS_THAN 001 */
17584 /* else: GREATER_THAN 000 */
17585
17586 *pfMxcsr |= fDe;
17587 }
17588
17589 *pfEFlags = fEFlagsNew;
17590}
17591#endif
17592
17593IEM_DECL_IMPL_DEF(void, iemAImpl_vcomisd_u128_fallback,(uint32_t *pfMxcsr, uint32_t *pfEFlags, PCX86XMMREG puSrc1, PCX86XMMREG puSrc2))
17594{
17595 iemAImpl_comisd_u128(pfMxcsr, pfEFlags, puSrc1, puSrc2);
17596}
17597
17598
17599/**
17600 * CMPPS / CMPPD / CMPSS / CMPSD
17601 */
17602#ifdef IEM_WITHOUT_ASSEMBLY
/**
 * One row of the compare predicate truth table.
 *
 * @note    The member order must not change, the table is filled in using
 *          positional initializers.
 */
typedef struct CMPTRUTHTBLENTRY
{
    /** Whether \#IA is signalled when one of the source operands is a QNaN. */
    bool        fSignalsOnQNan;
    /** Predicate result when the operands are unordered. */
    bool        fUnordered;
    /** Predicate result when A = B. */
    bool        fEqual;
    /** Predicate result when A < B. */
    bool        fLowerThan;
    /** Predicate result when A > B. */
    bool        fGreaterThan;
} CMPTRUTHTBLENTRY;
/** Pointer to a const compare truth table entry. */
typedef CMPTRUTHTBLENTRY const *PCCMPTRUTHTBLENTRY;
17621
17622
17623/** The compare truth table (indexed by immediate). */
17624static const CMPTRUTHTBLENTRY g_aCmpTbl[] =
17625{
17626 /* fSignalsOnQNan fUnordered fEqual fLowerThan fGreaterThan */
17627 /* 00H (EQ_OQ) */ { false, false, true, false, false },
17628 /* 01H (LT_OS) */ { true, false, false, true, false },
17629 /* 02H (LE_OS) */ { true, false, true, true, false },
17630 /* 03H (UNORD_Q) */ { false, true, false, false, false },
17631 /* 04H (NEQ_UQ) */ { false, true, false, true, true },
17632 /* 05H (NLT_US) */ { true, true, true, false, true },
17633 /* 06H (NLE_US) */ { true, true, false, false, true },
17634 /* 07H (ORQ_Q) */ { false, false, true, true, true },
17635 /** @todo AVX variants. */
17636};
17637
17638
17639static bool iemAImpl_cmp_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src1, PCRTFLOAT32U pr32Src2, uint8_t bEvil)
17640{
17641 bool fRes;
17642 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17643
17644 if (RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src1) || RTFLOAT32U_IS_SIGNALLING_NAN(pr32Src2))
17645 {
17646 *pfMxcsr |= X86_MXCSR_IE;
17647 fRes = g_aCmpTbl[bEvil].fUnordered;
17648 }
17649 else if (RTFLOAT32U_IS_QUIET_NAN(pr32Src1) || RTFLOAT32U_IS_QUIET_NAN(pr32Src2))
17650 {
17651 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17652 *pfMxcsr |= X86_MXCSR_IE;
17653 fRes = g_aCmpTbl[bEvil].fUnordered;
17654 }
17655 else
17656 {
17657 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17658
17659 RTFLOAT32U r32Src1, r32Src2;
17660 uint32_t fDe = iemSsePrepareValueR32(&r32Src1, *pfMxcsr, pr32Src1);
17661 fDe |= iemSsePrepareValueR32(&r32Src2, *pfMxcsr, pr32Src2);
17662
17663 *pfMxcsr |= fDe;
17664 float32_t f32Src1 = iemFpSoftF32FromIprt(&r32Src1);
17665 float32_t f32Src2 = iemFpSoftF32FromIprt(&r32Src2);
17666 if (f32_eq(f32Src1, f32Src2, &SoftState))
17667 fRes = g_aCmpTbl[bEvil].fEqual;
17668 else if (f32_lt(f32Src1, f32Src2, &SoftState))
17669 fRes = g_aCmpTbl[bEvil].fLowerThan;
17670 else
17671 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17672 }
17673
17674 return fRes;
17675}
17676
17677
17678static bool iemAImpl_cmp_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src1, PCRTFLOAT64U pr64Src2, uint8_t bEvil)
17679{
17680 bool fRes;
17681 AssertRelease(bEvil < RT_ELEMENTS(g_aCmpTbl));
17682
17683 if (RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src1) || RTFLOAT64U_IS_SIGNALLING_NAN(pr64Src2))
17684 {
17685 *pfMxcsr |= X86_MXCSR_IE;
17686 fRes = g_aCmpTbl[bEvil].fUnordered;
17687 }
17688 else if (RTFLOAT64U_IS_QUIET_NAN(pr64Src1) || RTFLOAT64U_IS_QUIET_NAN(pr64Src2))
17689 {
17690 if (g_aCmpTbl[bEvil].fSignalsOnQNan)
17691 *pfMxcsr |= X86_MXCSR_IE;
17692 fRes = g_aCmpTbl[bEvil].fUnordered;
17693 }
17694 else
17695 {
17696 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(*pfMxcsr);
17697
17698 RTFLOAT64U r64Src1, r64Src2;
17699 uint32_t fDe = iemSsePrepareValueR64(&r64Src1, *pfMxcsr, pr64Src1)
17700 | iemSsePrepareValueR64(&r64Src2, *pfMxcsr, pr64Src2);
17701
17702 *pfMxcsr |= fDe;
17703 float64_t f64Src1 = iemFpSoftF64FromIprt(&r64Src1);
17704 float64_t f64Src2 = iemFpSoftF64FromIprt(&r64Src2);
17705 if (f64_eq(f64Src1, f64Src2, &SoftState))
17706 fRes = g_aCmpTbl[bEvil].fEqual;
17707 else if (f64_lt(f64Src1, f64Src2, &SoftState))
17708 fRes = g_aCmpTbl[bEvil].fLowerThan;
17709 else
17710 fRes = g_aCmpTbl[bEvil].fGreaterThan;
17711 }
17712
17713 return fRes;
17714}
17715
17716
17717IEM_DECL_IMPL_DEF(void, iemAImpl_cmpps_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17718{
17719 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17720 {
17721 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[i], &pSrc->uSrc2.ar32[i], bEvil & 0x7))
17722 puDst->au32[i] = UINT32_MAX;
17723 else
17724 puDst->au32[i] = 0;
17725 }
17726}
17727
17728
17729IEM_DECL_IMPL_DEF(void, iemAImpl_cmppd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17730{
17731 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17732 {
17733 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[i], &pSrc->uSrc2.ar64[i], bEvil & 0x7))
17734 puDst->au64[i] = UINT64_MAX;
17735 else
17736 puDst->au64[i] = 0;
17737 }
17738}
17739
17740
17741IEM_DECL_IMPL_DEF(void, iemAImpl_cmpss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17742{
17743 if (iemAImpl_cmp_worker_r32(pfMxcsr, &pSrc->uSrc1.ar32[0], &pSrc->uSrc2.ar32[0], bEvil & 0x7))
17744 puDst->au32[0] = UINT32_MAX;
17745 else
17746 puDst->au32[0] = 0;
17747
17748 puDst->au32[1] = pSrc->uSrc1.au32[1];
17749 puDst->au64[1] = pSrc->uSrc1.au64[1];
17750}
17751
17752
17753IEM_DECL_IMPL_DEF(void, iemAImpl_cmpsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bEvil))
17754{
17755 if (iemAImpl_cmp_worker_r64(pfMxcsr, &pSrc->uSrc1.ar64[0], &pSrc->uSrc2.ar64[0], bEvil & 0x7))
17756 puDst->au64[0] = UINT64_MAX;
17757 else
17758 puDst->au64[0] = 0;
17759
17760 puDst->au64[1] = pSrc->uSrc1.au64[1];
17761}
17762#endif
17763
17764
17765/**
17766 * ROUNDPS / ROUNDPD / ROUNDSS / ROUNDSD
17767 */
17768
/** ROUNDxx immediate: rounding control value, used when the select bit is clear. */
#define X86_SSE_ROUNDXX_IMM_RC_MASK     UINT8_C(0x03)
/** ROUNDxx immediate: rounding select; when set the MXCSR rounding control is used. */
#define X86_SSE_ROUNDXX_IMM_ROUND_SEL   UINT8_C(0x04)
/** ROUNDxx immediate: precision bit; when set the rounding is done non-exact. */
#define X86_SSE_ROUNDXX_IMM_PRECISION   UINT8_C(0x08)

/** ROUNDxx immediate: all the bits defined above. */
#define X86_SSE_ROUNDXX_IMM_MASK        UINT8_C(0x0F)
17774
17775DECLINLINE(softfloat_state_t) iemSseRoundXXMxcsrAndImmToSoftState(uint32_t fMxcsr, uint8_t bImm)
17776{
17777 if (bImm & X86_SSE_ROUNDXX_IMM_ROUND_SEL)
17778 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17779
17780 fMxcsr &= ~X86_MXCSR_RC_MASK;
17781 fMxcsr |= (bImm & X86_SSE_ROUNDXX_IMM_RC_MASK) << X86_MXCSR_RC_SHIFT;
17782 return IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17783}
17784
17785static RTFLOAT32U iemAImpl_round_worker_r32(uint32_t *pfMxcsr, PCRTFLOAT32U pr32Src, uint8_t bImm)
17786{
17787 RTFLOAT32U r32Src, r32Dst;
17788 float32_t f32Src;
17789 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17790 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17791
17792 iemSsePrepareValueR32(&r32Src, *pfMxcsr, pr32Src);
17793 f32Src = f32_roundToInt(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, fExact, &SoftState);
17794
17795 iemFpSoftF32ToIprt(&r32Dst, f32Src);
17796 return r32Dst;
17797}
17798
17799static RTFLOAT64U iemAImpl_round_worker_r64(uint32_t *pfMxcsr, PCRTFLOAT64U pr64Src, uint8_t bImm)
17800{
17801 RTFLOAT64U r64Src, r64Dst;
17802 float64_t f64Src;
17803 softfloat_state_t SoftState = iemSseRoundXXMxcsrAndImmToSoftState(*pfMxcsr, bImm);
17804 bool fExact = !RT_BOOL(bImm & X86_SSE_ROUNDXX_IMM_PRECISION);
17805
17806 iemSsePrepareValueR64(&r64Src, *pfMxcsr, pr64Src);
17807 f64Src = f64_roundToInt(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, fExact, &SoftState);
17808
17809 iemFpSoftF64ToIprt(&r64Dst, f64Src);
17810 return r64Dst;
17811}
17812
17813#ifdef IEM_WITHOUT_ASSEMBLY
17814IEM_DECL_IMPL_DEF(void, iemAImpl_roundss_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17815{
17816 puDst->ar32[0] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17817 puDst->au32[1] = pSrc->uSrc1.au32[1];
17818 puDst->au64[1] = pSrc->uSrc1.au64[1];
17819}
17820
17821
17822IEM_DECL_IMPL_DEF(void, iemAImpl_roundsd_u128,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17823{
17824 puDst->ar64[0] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[0], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17825 puDst->au64[1] = pSrc->uSrc1.au64[1];
17826}
17827#endif
17828
17829IEM_DECL_IMPL_DEF(void, iemAImpl_roundps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17830{
17831 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar32); i++)
17832 {
17833 puDst->ar32[i] = iemAImpl_round_worker_r32(pfMxcsr, &pSrc->uSrc2.ar32[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17834 }
17835}
17836
17837
17838IEM_DECL_IMPL_DEF(void, iemAImpl_roundpd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
17839{
17840 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->ar64); i++)
17841 {
17842 puDst->ar64[i] = iemAImpl_round_worker_r64(pfMxcsr, &pSrc->uSrc2.ar64[i], bImm & X86_SSE_ROUNDXX_IMM_MASK);
17843 }
17844}
17845
17846/**
17847 * CVTPD2PI
17848 */
17849#ifdef IEM_WITHOUT_ASSEMBLY
17850static uint32_t iemAImpl_cvtpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17851{
17852 RTFLOAT64U r64Src;
17853 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17854
17855 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17856 *pi32Dst = f64_to_i32(iemFpSoftF64FromIprt(&r64Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17857 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17858}
17859
17860
17861IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17862{
17863 RTUINT64U u64Res;
17864 uint32_t fMxcsrOut = iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17865 fMxcsrOut |= iemAImpl_cvtpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17866
17867 *pu64Dst = u64Res.u;
17868 *pfMxcsr = fMxcsrOut;
17869}
17870#endif
17871
17872
17873/**
17874 * CVTTPD2PI
17875 */
17876#ifdef IEM_WITHOUT_ASSEMBLY
17877static uint32_t iemAImpl_cvttpd2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT64U pr64Src)
17878{
17879 RTFLOAT64U r64Src;
17880 iemSsePrepareValueR64(&r64Src, fMxcsr, pr64Src); /* The de-normal flag is not set. */
17881
17882 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17883 *pi32Dst = f64_to_i32_r_minMag(iemFpSoftF64FromIprt(&r64Src), true /*exact*/, &SoftState);
17884 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17885}
17886
17887
17888IEM_DECL_IMPL_DEF(void, iemAImpl_cvttpd2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, PCX86XMMREG pSrc))
17889{
17890 RTUINT64U u64Res;
17891 uint32_t fMxcsrOut = iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[0], &pSrc->ar64[0]);
17892 fMxcsrOut |= iemAImpl_cvttpd2pi_u128_worker(*pfMxcsr, &u64Res.ai32[1], &pSrc->ar64[1]);
17893
17894 *pu64Dst = u64Res.u;
17895 *pfMxcsr = fMxcsrOut;
17896}
17897#endif
17898
17899
17900/**
17901 * CVTPI2PS
17902 */
17903#ifdef IEM_WITHOUT_ASSEMBLY
17904static uint32_t iemAImpl_cvtpi2ps_u128_worker(uint32_t fMxcsr, PRTFLOAT32U pr32Dst, int32_t i32Src)
17905{
17906 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17907 float32_t r32Res = i32_to_f32(i32Src, &SoftState);
17908 return iemSseSoftStateAndR32ToMxcsrAndIprtResult(&SoftState, r32Res, pr32Dst, fMxcsr);
17909}
17910
17911
17912IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2ps_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17913{
17914 RTUINT64U uSrc = { u64Src };
17915 uint32_t fMxcsrOut = iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[0], uSrc.ai32[0]);
17916 fMxcsrOut |= iemAImpl_cvtpi2ps_u128_worker(*pfMxcsr, &pDst->ar32[1], uSrc.ai32[1]);
17917 *pfMxcsr = fMxcsrOut;
17918}
17919#endif
17920
17921
17922/**
17923 * CVTPI2PD
17924 */
17925#ifdef IEM_WITHOUT_ASSEMBLY
17926static uint32_t iemAImpl_cvtpi2pd_u128_worker(uint32_t fMxcsr, PRTFLOAT64U pr64Dst, int32_t i32Src)
17927{
17928 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17929 float64_t r64Res = i32_to_f64(i32Src, &SoftState);
17930 return iemSseSoftStateAndR64ToMxcsrAndIprtResult(&SoftState, r64Res, pr64Dst, fMxcsr);
17931}
17932
17933
17934IEM_DECL_IMPL_DEF(void, iemAImpl_cvtpi2pd_u128,(uint32_t *pfMxcsr, PX86XMMREG pDst, uint64_t u64Src))
17935{
17936 RTUINT64U uSrc = { u64Src };
17937 uint32_t fMxcsrOut = iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[0], uSrc.ai32[0]);
17938 fMxcsrOut |= iemAImpl_cvtpi2pd_u128_worker(*pfMxcsr, &pDst->ar64[1], uSrc.ai32[1]);
17939 *pfMxcsr = fMxcsrOut;
17940}
17941#endif
17942
17943
17944/**
17945 * CVTPS2PI
17946 */
17947#ifdef IEM_WITHOUT_ASSEMBLY
17948static uint32_t iemAImpl_cvtps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17949{
17950 RTFLOAT32U r32Src;
17951 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17952
17953 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17954 *pi32Dst = f32_to_i32(iemFpSoftF32FromIprt(&r32Src), SoftState.roundingMode, true /*exact*/, &SoftState);
17955 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17956}
17957
17958
17959IEM_DECL_IMPL_DEF(void, iemAImpl_cvtps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17960{
17961 RTUINT64U uDst;
17962 RTUINT64U uSrc = { u64Src };
17963 uint32_t fMxcsrOut = iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17964 fMxcsrOut |= iemAImpl_cvtps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17965 *pu64Dst = uDst.u;
17966 *pfMxcsr = fMxcsrOut;
17967}
17968#endif
17969
17970
17971/**
17972 * CVTTPS2PI
17973 */
17974#ifdef IEM_WITHOUT_ASSEMBLY
17975static uint32_t iemAImpl_cvttps2pi_u128_worker(uint32_t fMxcsr, int32_t *pi32Dst, PCRTFLOAT32U pr32Src)
17976{
17977 RTFLOAT32U r32Src;
17978 iemSsePrepareValueR32(&r32Src, fMxcsr, pr32Src); /* The de-normal flag is not set. */
17979
17980 softfloat_state_t SoftState = IEM_SOFTFLOAT_STATE_INITIALIZER_FROM_MXCSR(fMxcsr);
17981 *pi32Dst = f32_to_i32_r_minMag(iemFpSoftF32FromIprt(&r32Src), true /*exact*/, &SoftState);
17982 return fMxcsr | (SoftState.exceptionFlags & X86_MXCSR_XCPT_FLAGS);
17983}
17984
17985
17986IEM_DECL_IMPL_DEF(void, iemAImpl_cvttps2pi_u128,(uint32_t *pfMxcsr, uint64_t *pu64Dst, uint64_t u64Src))
17987{
17988 RTUINT64U uDst;
17989 RTUINT64U uSrc = { u64Src };
17990 uint32_t fMxcsrOut = iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[0], (PCRTFLOAT32U)&uSrc.au32[0]);
17991 fMxcsrOut |= iemAImpl_cvttps2pi_u128_worker(*pfMxcsr, &uDst.ai32[1], (PCRTFLOAT32U)&uSrc.au32[1]);
17992 *pu64Dst = uDst.u;
17993 *pfMxcsr = fMxcsrOut;
17994}
17995#endif
17996
17997/**
17998 * RDRAND
17999 */
18000IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18001{
18002 *puDst = 0;
18003 *pEFlags &= ~X86_EFL_STATUS_BITS;
18004 *pEFlags |= X86_EFL_CF;
18005}
18006
18007IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18008{
18009 *puDst = 0;
18010 *pEFlags &= ~X86_EFL_STATUS_BITS;
18011 *pEFlags |= X86_EFL_CF;
18012}
18013
18014IEM_DECL_IMPL_DEF(void, iemAImpl_rdrand_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18015{
18016 *puDst = 0;
18017 *pEFlags &= ~X86_EFL_STATUS_BITS;
18018 *pEFlags |= X86_EFL_CF;
18019}
18020
18021/**
18022 * RDSEED
18023 */
18024IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u16_fallback,(uint16_t *puDst, uint32_t *pEFlags))
18025{
18026 *puDst = 0;
18027 *pEFlags &= ~X86_EFL_STATUS_BITS;
18028 *pEFlags |= X86_EFL_CF;
18029}
18030
18031IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u32_fallback,(uint32_t *puDst, uint32_t *pEFlags))
18032{
18033 *puDst = 0;
18034 *pEFlags &= ~X86_EFL_STATUS_BITS;
18035 *pEFlags |= X86_EFL_CF;
18036}
18037
18038IEM_DECL_IMPL_DEF(void, iemAImpl_rdseed_u64_fallback,(uint64_t *puDst, uint32_t *pEFlags))
18039{
18040 *puDst = 0;
18041 *pEFlags &= ~X86_EFL_STATUS_BITS;
18042 *pEFlags |= X86_EFL_CF;
18043}
18044
18045
18046/**
18047 * SHA1NEXTE
18048 */
18049IEM_DECL_IMPL_DEF(void, iemAImpl_sha1nexte_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18050{
18051 uint32_t u32Tmp = ASMRotateLeftU32(puDst->au32[3], 30);
18052
18053 puDst->au32[0] = puSrc->au32[0];
18054 puDst->au32[1] = puSrc->au32[1];
18055 puDst->au32[2] = puSrc->au32[2];
18056 puDst->au32[3] = puSrc->au32[3] + u32Tmp;
18057}
18058
18059/**
18060 * SHA1MSG1
18061 */
18062IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18063{
18064 uint32_t u32W0 = puDst->au32[3];
18065 uint32_t u32W1 = puDst->au32[2];
18066 uint32_t u32W2 = puDst->au32[1];
18067 uint32_t u32W3 = puDst->au32[0];
18068 uint32_t u32W4 = puSrc->au32[3];
18069 uint32_t u32W5 = puSrc->au32[2];
18070
18071 puDst->au32[3] = u32W2 ^ u32W0;
18072 puDst->au32[2] = u32W3 ^ u32W1;
18073 puDst->au32[1] = u32W4 ^ u32W2;
18074 puDst->au32[0] = u32W5 ^ u32W3;
18075}
18076
18077/**
18078 * SHA1MSG2
18079 */
18080IEM_DECL_IMPL_DEF(void, iemAImpl_sha1msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18081{
18082 uint32_t u32W13 = puSrc->au32[2];
18083 uint32_t u32W14 = puSrc->au32[1];
18084 uint32_t u32W15 = puSrc->au32[0];
18085 uint32_t u32W16 = ASMRotateLeftU32(puDst->au32[3] ^ u32W13, 1);
18086 uint32_t u32W17 = ASMRotateLeftU32(puDst->au32[2] ^ u32W14, 1);
18087 uint32_t u32W18 = ASMRotateLeftU32(puDst->au32[1] ^ u32W15, 1);
18088 uint32_t u32W19 = ASMRotateLeftU32(puDst->au32[0] ^ u32W16, 1);
18089
18090 puDst->au32[3] = u32W16;
18091 puDst->au32[2] = u32W17;
18092 puDst->au32[1] = u32W18;
18093 puDst->au32[0] = u32W19;
18094}
18095
18096/**
18097 * SHA1RNDS4
18098 */
18099typedef IEM_DECL_IMPL_TYPE(uint32_t, FNIEMAIMPLSHA1RNDS4FN, (uint32_t u32B, uint32_t u32C, uint32_t u32D));
18100typedef FNIEMAIMPLSHA1RNDS4FN *PFNIEMAIMPLSHA1RNDS4FN;
18101
18102static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f0(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18103{
18104 return (u32B & u32C) ^ (~u32B & u32D);
18105}
18106
18107static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f1(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18108{
18109 return u32B ^ u32C ^ u32D;
18110}
18111
18112static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f2(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18113{
18114 return (u32B & u32C) ^ (u32B & u32D) ^ (u32C & u32D);
18115}
18116
18117static DECLCALLBACK(uint32_t) iemAImpl_sha1rnds4_f3(uint32_t u32B, uint32_t u32C, uint32_t u32D) RT_NOEXCEPT
18118{
18119 return u32B ^ u32C ^ u32D;
18120}
18121
18122IEM_DECL_IMPL_DEF(void, iemAImpl_sha1rnds4_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18123{
18124 static uint32_t s_au32K[] = { UINT32_C(0x5a827999), UINT32_C(0x6ed9eba1), UINT32_C(0x8f1bbcdc), UINT32_C(0xca62c1d6) };
18125 static PFNIEMAIMPLSHA1RNDS4FN s_apfnFn[] = { iemAImpl_sha1rnds4_f0, iemAImpl_sha1rnds4_f1, iemAImpl_sha1rnds4_f2, iemAImpl_sha1rnds4_f3 };
18126
18127 uint32_t au32A[5];
18128 uint32_t au32B[5];
18129 uint32_t au32C[5];
18130 uint32_t au32D[5];
18131 uint32_t au32E[5];
18132 uint32_t au32W[4];
18133 PFNIEMAIMPLSHA1RNDS4FN pfnFn = s_apfnFn[bEvil & 0x3];
18134 uint32_t u32K = s_au32K[bEvil & 0x3];
18135
18136 au32A[0] = puDst->au32[3];
18137 au32B[0] = puDst->au32[2];
18138 au32C[0] = puDst->au32[1];
18139 au32D[0] = puDst->au32[0];
18140 for (uint32_t i = 0; i < RT_ELEMENTS(au32W); i++)
18141 au32W[i] = puSrc->au32[3 - i];
18142
18143 /* Round 0 is a bit different than the other rounds. */
18144 au32A[1] = pfnFn(au32B[0], au32C[0], au32D[0]) + ASMRotateLeftU32(au32A[0], 5) + au32W[0] + u32K;
18145 au32B[1] = au32A[0];
18146 au32C[1] = ASMRotateLeftU32(au32B[0], 30);
18147 au32D[1] = au32C[0];
18148 au32E[1] = au32D[0];
18149
18150 for (uint32_t i = 1; i <= 3; i++)
18151 {
18152 au32A[i + 1] = pfnFn(au32B[i], au32C[i], au32D[i]) + ASMRotateLeftU32(au32A[i], 5) + au32W[i] + au32E[i] + u32K;
18153 au32B[i + 1] = au32A[i];
18154 au32C[i + 1] = ASMRotateLeftU32(au32B[i], 30);
18155 au32D[i + 1] = au32C[i];
18156 au32E[i + 1] = au32D[i];
18157 }
18158
18159 puDst->au32[3] = au32A[4];
18160 puDst->au32[2] = au32B[4];
18161 puDst->au32[1] = au32C[4];
18162 puDst->au32[0] = au32D[4];
18163}
18164
18165
18166/**
18167 * SHA256MSG1
18168 */
18169DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma0(uint32_t u32Val)
18170{
18171 return ASMRotateRightU32(u32Val, 7) ^ ASMRotateRightU32(u32Val, 18) ^ (u32Val >> 3);
18172}
18173
18174IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg1_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18175{
18176 uint32_t u32W4 = puSrc->au32[0];
18177 uint32_t u32W3 = puDst->au32[3];
18178 uint32_t u32W2 = puDst->au32[2];
18179 uint32_t u32W1 = puDst->au32[1];
18180 uint32_t u32W0 = puDst->au32[0];
18181
18182 puDst->au32[3] = u32W3 + iemAImpl_sha256_lower_sigma0(u32W4);
18183 puDst->au32[2] = u32W2 + iemAImpl_sha256_lower_sigma0(u32W3);
18184 puDst->au32[1] = u32W1 + iemAImpl_sha256_lower_sigma0(u32W2);
18185 puDst->au32[0] = u32W0 + iemAImpl_sha256_lower_sigma0(u32W1);
18186}
18187
18188/**
18189 * SHA256MSG2
18190 */
18191DECLINLINE(uint32_t) iemAImpl_sha256_lower_sigma1(uint32_t u32Val)
18192{
18193 return ASMRotateRightU32(u32Val, 17) ^ ASMRotateRightU32(u32Val, 19) ^ (u32Val >> 10);
18194}
18195
18196IEM_DECL_IMPL_DEF(void, iemAImpl_sha256msg2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc))
18197{
18198 uint32_t u32W14 = puSrc->au32[2];
18199 uint32_t u32W15 = puSrc->au32[3];
18200 uint32_t u32W16 = puDst->au32[0] + iemAImpl_sha256_lower_sigma1(u32W14);
18201 uint32_t u32W17 = puDst->au32[1] + iemAImpl_sha256_lower_sigma1(u32W15);
18202 uint32_t u32W18 = puDst->au32[2] + iemAImpl_sha256_lower_sigma1(u32W16);
18203 uint32_t u32W19 = puDst->au32[3] + iemAImpl_sha256_lower_sigma1(u32W17);
18204
18205 puDst->au32[3] = u32W19;
18206 puDst->au32[2] = u32W18;
18207 puDst->au32[1] = u32W17;
18208 puDst->au32[0] = u32W16;
18209}
18210
18211/**
18212 * SHA256RNDS2
18213 */
18214DECLINLINE(uint32_t) iemAImpl_sha256_ch(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18215{
18216 return (u32X & u32Y) ^ (~u32X & u32Z);
18217}
18218
18219DECLINLINE(uint32_t) iemAImpl_sha256_maj(uint32_t u32X, uint32_t u32Y, uint32_t u32Z)
18220{
18221 return (u32X & u32Y) ^ (u32X & u32Z) ^ (u32Y & u32Z);
18222}
18223
18224DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma0(uint32_t u32Val)
18225{
18226 return ASMRotateRightU32(u32Val, 2) ^ ASMRotateRightU32(u32Val, 13) ^ ASMRotateRightU32(u32Val, 22);
18227}
18228
18229DECLINLINE(uint32_t) iemAImpl_sha256_upper_sigma1(uint32_t u32Val)
18230{
18231 return ASMRotateRightU32(u32Val, 6) ^ ASMRotateRightU32(u32Val, 11) ^ ASMRotateRightU32(u32Val, 25);
18232}
18233
18234IEM_DECL_IMPL_DEF(void, iemAImpl_sha256rnds2_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, PCRTUINT128U puXmm0Constants))
18235{
18236 uint32_t au32A[3];
18237 uint32_t au32B[3];
18238 uint32_t au32C[3];
18239 uint32_t au32D[3];
18240 uint32_t au32E[3];
18241 uint32_t au32F[3];
18242 uint32_t au32G[3];
18243 uint32_t au32H[3];
18244 uint32_t au32WK[2];
18245
18246 au32A[0] = puSrc->au32[3];
18247 au32B[0] = puSrc->au32[2];
18248 au32C[0] = puDst->au32[3];
18249 au32D[0] = puDst->au32[2];
18250 au32E[0] = puSrc->au32[1];
18251 au32F[0] = puSrc->au32[0];
18252 au32G[0] = puDst->au32[1];
18253 au32H[0] = puDst->au32[0];
18254
18255 au32WK[0] = puXmm0Constants->au32[0];
18256 au32WK[1] = puXmm0Constants->au32[1];
18257
18258 for (uint32_t i = 0; i < 2; i++)
18259 {
18260 au32A[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18261 + iemAImpl_sha256_upper_sigma1(au32E[i])
18262 + au32WK[i]
18263 + au32H[i]
18264 + iemAImpl_sha256_maj(au32A[i], au32B[i], au32C[i])
18265 + iemAImpl_sha256_upper_sigma0(au32A[i]);
18266 au32B[i + 1] = au32A[i];
18267 au32C[i + 1] = au32B[i];
18268 au32D[i + 1] = au32C[i];
18269 au32E[i + 1] = iemAImpl_sha256_ch(au32E[i], au32F[i], au32G[i])
18270 + iemAImpl_sha256_upper_sigma1(au32E[i])
18271 + au32WK[i]
18272 + au32H[i]
18273 + au32D[i];
18274 au32F[i + 1] = au32E[i];
18275 au32G[i + 1] = au32F[i];
18276 au32H[i + 1] = au32G[i];
18277 }
18278
18279 puDst->au32[3] = au32A[2];
18280 puDst->au32[2] = au32B[2];
18281 puDst->au32[1] = au32E[2];
18282 puDst->au32[0] = au32F[2];
18283}
18284
18285
18286/**
18287 * ADCX
18288 */
/*
 * Shared body of the ADCX/ADOX fallbacks.  Expects puDst, pfEFlags and uSrc
 * to be in scope.  Adds uSrc and the current value of a_Flag to *puDst and
 * leaves the carry-out in a_Flag; no other EFLAGS bit is modified.
 *
 * There is a carry-out when the plain sum wrapped around, or when the plain
 * sum is all ones (a_Max) and the incoming flag pushes it over.
 */
#define ADX_EMIT(a_Flag, a_Type, a_Max) \
    do \
    { \
        bool const   fCarryIn = RT_BOOL(*pfEFlags & (a_Flag)); \
        a_Type const uSum     = *puDst + uSrc; \
        if (   uSum < uSrc \
            || (fCarryIn && uSum == (a_Max))) \
            *pfEFlags |= (a_Flag); \
        else \
            *pfEFlags &= ~(a_Flag); \
        *puDst = fCarryIn ? (a_Type)(uSum + 1) : uSum; \
    } \
    while (0)
18306
18307IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18308{
18309 ADX_EMIT(X86_EFL_CF, uint32_t, UINT32_MAX);
18310}
18311
18312IEM_DECL_IMPL_DEF(void, iemAImpl_adcx_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18313{
18314 ADX_EMIT(X86_EFL_CF, uint64_t, UINT64_MAX);
18315}
18316
18317
18318/**
18319 * ADOX
18320 */
18321IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u32_fallback,(uint32_t *puDst, uint32_t *pfEFlags, uint32_t uSrc))
18322{
18323 ADX_EMIT(X86_EFL_OF, uint32_t, UINT32_MAX);
18324}
18325
18326IEM_DECL_IMPL_DEF(void, iemAImpl_adox_u64_fallback,(uint64_t *puDst, uint32_t *pfEFlags, uint64_t uSrc))
18327{
18328 ADX_EMIT(X86_EFL_OF, uint64_t, UINT64_MAX);
18329}
18330
18331
18332/**
18333 * MPSADBW
18334 */
18335IEM_DECL_IMPL_DEF(void, iemAImpl_mpsadbw_u128_fallback,(PRTUINT128U puDst, PCRTUINT128U puSrc, uint8_t bEvil))
18336{
18337 uint8_t idxSrc2 = (bEvil & 0x3) * sizeof(uint32_t);
18338 uint8_t idxSrc1 = ((bEvil >> 2) & 0x1) * sizeof(uint32_t);
18339 int16_t ai16Src1[11];
18340 int16_t ai16Src2[4];
18341
18342 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src1); i++)
18343 ai16Src1[i] = puDst->au8[idxSrc1 + i];
18344
18345 for (uint32_t i = 0; i < RT_ELEMENTS(ai16Src2); i++)
18346 ai16Src2[i] = puSrc->au8[idxSrc2 + i];
18347
18348 for (uint8_t i = 0; i < RT_ELEMENTS(puDst->au16); i++)
18349 puDst->au16[i] = RT_ABS(ai16Src1[i] - ai16Src2[0])
18350 + RT_ABS(ai16Src1[i + 1] - ai16Src2[1])
18351 + RT_ABS(ai16Src1[i + 2] - ai16Src2[2])
18352 + RT_ABS(ai16Src1[i + 3] - ai16Src2[3]);
18353}
18354
18355
18356/**
18357 * DPPS
18358 */
18359IEM_DECL_IMPL_DEF(void, iemAImpl_dpps_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18360{
18361 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18362 AssertReleaseFailed();
18363}
18364
18365
18366/**
18367 * DPPD
18368 */
18369IEM_DECL_IMPL_DEF(void, iemAImpl_dppd_u128_fallback,(uint32_t *pfMxcsr, PX86XMMREG puDst, PCIEMMEDIAF2XMMSRC pSrc, uint8_t bImm))
18370{
18371 RT_NOREF(pfMxcsr, puDst, pSrc, bImm);
18372 AssertReleaseFailed();
18373}
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette