VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 102868

Last change on this file since 102868 was 102817, checked in by vboxsync, 15 months ago

IEM: Added RCPSS/RCPPS assembly implementation and C stubs.

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 192.0 KB
 
1; $Id: IEMAllAImpl.asm 102817 2024-01-10 13:56:06Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its own stack arguments, so we
; must emit 'ret <cbArgs>'; on every other target a plain 'ret' is correct.
;
; @param        1       Number of argument bytes to pop (x86 Windows fastcall only).
;
%macro RET_FASTCALL 1
 %ifdef RT_ARCH_X86
  %ifdef RT_OS_WINDOWS
        ret     %1                      ; callee pops its stack arguments
  %else
        ret
  %endif
 %else
        ret                             ; AMD64: all args in registers, nothing to pop
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
;        escaping (or whatever the dollar is good for here).  Thus the ugly
;        prefix argument.
;
; Default: plain C name, no decoration.
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  ; 32-bit Windows fastcall decoration: <prefix><name>@<cbArgs>, e.g. @foo@12.
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the (possibly fastcall-decorated) hidden global symbol followed by an
; IBT end-branch marker so the entry point is a valid indirect-branch target
; when CET/IBT is enabled.
;
; @param        1       The function name (C).
; @param        2       The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
        IBT_ENDBRxx
%endmacro
81
82
;
; We employ some macro assembly here to hide the calling convention differences.
;
; A0..A3 name the first four function arguments and T0..T2 name scratch
; registers that may be used freely (T2 exists only on AMD64).  On AMD64 all
; four arguments arrive in registers, so the prologues/epilogues are trivial;
; on x86 (fastcall) only the first two arrive in ecx/edx, so the 3rd and 4th
; must be loaded off the stack into callee-saved registers, which the
; prologues preserve and the epilogues restore.
;
%ifdef RT_ARCH_AMD64
 ; AMD64: nothing to do on entry; epilogues are a plain ret.
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret                             ; %1 (x86 stack bytes) is irrelevant on AMD64
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI: integer arguments in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
  %define A3_8      cl
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI: integer arguments in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
  %define A3_8      r9b
 %endif

 ; Scratch registers - volatile in both 64-bit conventions, no saving needed.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86
 %macro PROLOGUE_1_ARGS 0
        push    edi                     ; preserve callee-saved edi (used as T1)
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1                      ; pop the fastcall stack arguments
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi                     ; preserve callee-saved edi (used as T1)
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx                     ; preserve callee-saved ebx (used as A2)
        mov     ebx, [esp + 4 + 4]      ; load the 3rd argument (above ret addr + saved ebx)
        push    edi                     ; preserve callee-saved edi (used as T1)
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1                      ; pop the stack-passed argument bytes
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx                     ; preserve callee-saved ebx (used as A2)
        push    edi                     ; preserve callee-saved edi (used as T1)
        push    esi                     ; preserve callee-saved esi (used as A3)
        mov     ebx, [esp + 12 + 4 + 0] ; load the 3rd argument (above ret addr + 3 saved regs)
        mov     esi, [esp + 12 + 4 + 4] ; load the 4th argument
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 ; x86 fastcall: A0/A1 in ecx/edx; A2/A3 loaded by the prologues above.
 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si
 ; note: no A3_8 - esi has no 8-bit sub-register.

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
 ; note: no T1_8 (edi has no 8-bit sub-register) and no T2 on x86.
%endif
273
274
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined flag bits into the host EFLAGS via
; a pushf/popf round trip, so the instruction executes with guest flag input.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        A2      The register pointing to the flags.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of modified flags.
; @param        3       The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
        ;%if (%3) != 0                  ; guard currently disabled: flags are always loaded
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
        ;%endif
%endmacro
294
;;
; Load the relevant flags from [%1].
;
; Same mechanism as IEM_MAYBE_LOAD_FLAGS, but unconditional.
;
; @remarks      Clobbers T0, stack. Changes EFLAGS.
; @param        A2      The register pointing to the flags.
; @param        1       The parameter (A0..A3) pointing to the eflags.
; @param        2       The set of flags to load.
; @param        3       The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
312
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the emulated instruction and merges
; the modified+undefined bits back into the guest eflags at [%1].
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
332
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; @remarks      Clobbers T0, T1, stack.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the instruction
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
356
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0.  So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks      Clobbers T0, T1, stack, %6, EFLAGS.  Also T2 on AMD64.
; @param        1       The register pointing to the EFLAGS.
; @param        2       The mask of modified flags to save.
; @param        3       Mask of additional flags to always clear
; @param        4       The result register to set SF by.
; @param        5       The width of the %4 register in bits (8, 16, 32, or 64).
; @param        6       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf                           ; grab host flags in T2 so T0/xAX stays intact
        pop     T2
 %else
        push    T0                      ; no T2 on x86: save T0/xAX and borrow it
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the saved T0/xAX
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
%%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; reduce to the parity lookup index (low result byte)
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]         ; PF is bit 2; table entries are 0 or X86_EFL_PF
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
410
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Pure guest-flag adjustment; does not read the host EFLAGS.
;
; @remarks      Clobbers T0.
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
431
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the low byte of %4
; via the g_afParity lookup table.
;
; @remarks      Clobbers T0, %4, EFLAGS.  Also T2 on AMD64 (table address).
; @param        1       The register pointing to the EFLAGS.
; @param        2       Mask of additional flags to always clear
; @param        3       Mask of additional flags to always set.
; @param        4       The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; reduce to the parity lookup index (low byte)
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]         ; merge in PF from the table
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
456
457
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; Emits 'dw' pseudo-data whose operand overflows a word - triggering an
; assembler warning - whenever the actual size deviates in either direction.
;
; @param        1       The jump array size assembly expression.
; @param        2       The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1  ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1  ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1          ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1          ; will cause warning if entries are too small.
 %endif
%endmacro
473
474
475;*********************************************************************************************************************************
476;* External Symbols *
477;*********************************************************************************************************************************
478extern NAME(g_afParity)
479
480
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      byte [A0], A1_8         ; perform the operation with guest flag input
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; u64 passes 8 bytes on the x86 stack
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 byte [A0], A1_8         ; atomic variant for LOCK-prefixed guest code
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;                instr, lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
580
581
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: result into scratch T0
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                    instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
622
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0_32, [A0]             ; destination is read-modify-write
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

;                      instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
664
665
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1 and the second source register
; operand (the shift count for the fallbacks) in A2.  EFLAGS are not touched.
;
; @param        1       The instruction mnemonic.
; @param        2       Fallback instruction if applicable.
; @param        3       Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; legacy shift instruction needs the count in cl
        %2      A1_32, cl
        mov     [A0], A1_32
  %else
        xchg    A2, A0                  ; MSC: A0 is xCX, so swap it with A2 to free up cl
        %2      A1_32, cl
        mov     [A2], A1_32             ; A2 holds the destination pointer after the xchg
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

  %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
   %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result (was truncated to 32 bits)
   %else
        xchg    A2, A0                  ; free up cl; A2 now holds the destination pointer
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result via the swapped pointer
   %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
  %endif
 %endif ; RT_ARCH_AMD64
%endmacro

;                          instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
737
738
;
; RORX uses a immediate byte for the shift count, so we only do
; fallback implementation of that one.
;
; A0 = destination pointer, A1 = source value, A2 = rotate count.
; Implemented with the legacy 'ror', which needs the count in cl.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count into cl for the legacy instruction
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; MSC: A0 is xCX, swap with A2 to free up cl
        ror     A1_32, cl
        mov     [A2], A1_32             ; A2 holds the destination pointer now
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
  %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1
  %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1
  %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
772
773
;
; MULX
;
; A0 = high-result pointer, A1 = low-result pointer, A2 = uSrc1 (implicit xDX
; operand of mulx/mul), A3 = uSrc2.  The low half is stored first so that the
; high half wins when both destinations alias the same register.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect (mulx takes its implicit operand from EDX)
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


; Fallback for hosts without BMI2: plain 'mul', which uses EDX:EAX.
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32                   ; EDX:EAX = EAX * A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


; Fallback for hosts without BMI2: plain 'mul', which uses RDX:RAX.
BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2                      ; RDX:RAX = RAX * A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
853
854
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param        1       The instruction mnemonic.
; @param        2       Non-zero if there should be a locked version.
; @param        3       The modified flags.
; @param        4       The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; u64 passes 8 bytes on the x86 stack
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16        ; atomic variant for LOCK-prefixed guest code
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

;              instr, lock, modified-flags, undefined flags
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
931
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is.  The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures.  We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three variants are generated per width: the generic one (host behaviour),
; '_intel' (clears OF/SF/AF/CF, computes PF from the result, ZF from input),
; and '_amd' (only ZF modified).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       Non-zero if destination isn't written when ZF=1.  Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; ZF=1: source was zero, leave destination as-is
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        ; Intel: OF/SF/AF/CF cleared, ZF cleared (result found), PF from result.
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        ; Source was zero: ZF and PF set, OF/SF/AF/CF cleared.
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1083
1084
1085;;
1086; Macro for implementing POPCNT.
1087;
1088; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1089; system where the 64-bit accesses requires hand coding.
1090;
1091; All the functions takes a pointer to the destination memory operand in A0,
1092; the source register operand in A1 and a pointer to eflags in A2.
1093;
1094; ASSUMES Intel and AMD set EFLAGS the same way.
1095;
1096; ASSUMES the instruction does not support memory destination.
1097;
1098; @param 1 The instruction mnemonic.
1099; @param 2 The modified flags.
1100; @param 3 The undefined flags.
1101;
; @note Unlike IEMIMPL_BIT_OP2 there is no "destination unchanged" special
; case here; the result is always stored to [A0].
1102%macro IEMIMPL_BIT_OP3 3
1103BEGINCODE
1104BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1105 PROLOGUE_3_ARGS
1106 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1107 %1 T0_16, A1_16 ; T0_16 = <op>(A1_16)
1108 mov [A0], T0_16 ; store the result
1109 IEM_SAVE_FLAGS A2, %2, %3
1110 EPILOGUE_3_ARGS
1111ENDPROC iemAImpl_ %+ %1 %+ _u16
1112
1113BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1114 PROLOGUE_3_ARGS
1115 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1116 %1 T0_32, A1_32
1117 mov [A0], T0_32
1118 IEM_SAVE_FLAGS A2, %2, %3
1119 EPILOGUE_3_ARGS
1120ENDPROC iemAImpl_ %+ %1 %+ _u32
1121
1122 %ifdef RT_ARCH_AMD64
1123BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1124 PROLOGUE_3_ARGS
1125 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1126 %1 T0, A1
1127 mov [A0], T0
1128 IEM_SAVE_FLAGS A2, %2, %3
1129 EPILOGUE_3_ARGS_EX 8
1130ENDPROC iemAImpl_ %+ %1 %+ _u64
1131 %endif ; RT_ARCH_AMD64
1132%endmacro
; All six status flags are declared modified and none undefined for POPCNT.
1133IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1134
1135
1136;
1137; IMUL is also a similar but yet different case (no lock, no mem dst).
1138; The rDX:rAX variant of imul is handled together with mul further down.
1139;
1140BEGINCODE
1141; @param 1 EFLAGS that are modified.
1142; @param 2 Undefined EFLAGS.
1143; @param 3 Function suffix.
1144; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1145; 2 for AMD (set AF, clear PF, ZF and SF).
; NOTE(review): for %4 == 2 (AMD) the code below takes the same native
; IEM_SAVE_FLAGS path as %4 == 0, just instantiated with an empty
; undefined-flags mask; the AF/PF/ZF/SF adjustment described above is not
; performed explicitly here - confirm this matches the intended AMD
; behaviour of IEM_SAVE_FLAGS.
;
; A0 = pointer to the destination operand (in/out), A1 = source register
; value (two-operand IMUL form), A2 = pointer to eflags.
1146%macro IEMIMPL_IMUL_TWO 4
1147BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1148 PROLOGUE_3_ARGS
1149 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1150 imul A1_16, word [A0] ; A1_16 *= *(uint16_t *)A0 (truncated product)
1151 mov [A0], A1_16 ; store the result back
1152 %if %4 != 1
1153 IEM_SAVE_FLAGS A2, %1, %2
1154 %else
1155 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1156 %endif
1157 EPILOGUE_3_ARGS
1158ENDPROC iemAImpl_imul_two_u16 %+ %3
1159
1160BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1161 PROLOGUE_3_ARGS
1162 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1163 imul A1_32, dword [A0]
1164 mov [A0], A1_32
1165 %if %4 != 1
1166 IEM_SAVE_FLAGS A2, %1, %2
1167 %else
1168 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1169 %endif
1170 EPILOGUE_3_ARGS
1171ENDPROC iemAImpl_imul_two_u32 %+ %3
1172
1173 %ifdef RT_ARCH_AMD64
1174BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1175 PROLOGUE_3_ARGS
1176 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1177 imul A1, qword [A0]
1178 mov [A0], A1
1179 %if %4 != 1
1180 IEM_SAVE_FLAGS A2, %1, %2
1181 %else
1182 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1183 %endif
1184 EPILOGUE_3_ARGS_EX 8
1185ENDPROC iemAImpl_imul_two_u64 %+ %3
1186 %endif ; RT_ARCH_AMD64
1187%endmacro
; Native, Intel and AMD flavours; only OF and CF are architecturally defined.
1188IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1189IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1190IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1191
1192
1193;
1194; XCHG for memory operands. This implies locking. No flag changes.
1195;
1196; Each function takes two arguments, first the pointer to the memory,
1197; then the pointer to the register. They all return void.
1198;
1199BEGINCODE
; Note! Since xchg with a memory operand implies locking (see above), no
; explicit 'lock' prefix is needed in these _locked variants.
1200BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1201 PROLOGUE_2_ARGS
1202 mov T0_8, [A1] ; T0 = register operand
1203 xchg [A0], T0_8 ; atomically swap with the memory operand
1204 mov [A1], T0_8 ; hand the old memory value back
1205 EPILOGUE_2_ARGS
1206ENDPROC iemAImpl_xchg_u8_locked
1207
1208BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1209 PROLOGUE_2_ARGS
1210 mov T0_16, [A1]
1211 xchg [A0], T0_16
1212 mov [A1], T0_16
1213 EPILOGUE_2_ARGS
1214ENDPROC iemAImpl_xchg_u16_locked
1215
1216BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1217 PROLOGUE_2_ARGS
1218 mov T0_32, [A1]
1219 xchg [A0], T0_32
1220 mov [A1], T0_32
1221 EPILOGUE_2_ARGS
1222ENDPROC iemAImpl_xchg_u32_locked
1223
1224%ifdef RT_ARCH_AMD64
1225BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1226 PROLOGUE_2_ARGS
1227 mov T0, [A1]
1228 xchg [A0], T0
1229 mov [A1], T0
1230 EPILOGUE_2_ARGS
1231ENDPROC iemAImpl_xchg_u64_locked
1232%endif
1233
1234; Unlocked variants for fDisregardLock mode.
; These avoid the implicit LOCK of a memory xchg by doing a non-atomic
; load/load/store/store swap instead.
1235
1236BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1237 PROLOGUE_2_ARGS
1238 mov T0_8, [A1] ; T0 = register operand
1239 mov T1_8, [A0] ; T1 = memory operand
1240 mov [A0], T0_8 ; swap them...
1241 mov [A1], T1_8 ; ...non-atomically
1242 EPILOGUE_2_ARGS
1243ENDPROC iemAImpl_xchg_u8_unlocked
1244
1245BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1246 PROLOGUE_2_ARGS
1247 mov T0_16, [A1]
1248 mov T1_16, [A0]
1249 mov [A0], T0_16
1250 mov [A1], T1_16
1251 EPILOGUE_2_ARGS
1252ENDPROC iemAImpl_xchg_u16_unlocked
1253
1254BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1255 PROLOGUE_2_ARGS
1256 mov T0_32, [A1]
1257 mov T1_32, [A0]
1258 mov [A0], T0_32
1259 mov [A1], T1_32
1260 EPILOGUE_2_ARGS
1261ENDPROC iemAImpl_xchg_u32_unlocked
1262
1263%ifdef RT_ARCH_AMD64
1264BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1265 PROLOGUE_2_ARGS
1266 mov T0, [A1]
1267 mov T1, [A0]
1268 mov [A0], T0
1269 mov [A1], T1
1270 EPILOGUE_2_ARGS
1271ENDPROC iemAImpl_xchg_u64_unlocked
1272%endif
1273
1274
1275;
1276; XADD for memory operands.
1277;
1278; Each function takes three arguments, first the pointer to the
1279; memory/register, then the pointer to the register, and finally a pointer to
1280; eflags. They all return void.
1281;
1282BEGINCODE
; Unlocked variants: the register operand (loaded from [A1]) receives the
; previous destination value, per xadd semantics.
1283BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1284 PROLOGUE_3_ARGS
1285 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1286 mov T0_8, [A1] ; T0 = register operand
1287 xadd [A0], T0_8 ; [A0] += T0; T0 = old [A0]
1288 mov [A1], T0_8 ; return the old destination value
1289 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1290 EPILOGUE_3_ARGS
1291ENDPROC iemAImpl_xadd_u8
1292
1293BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1294 PROLOGUE_3_ARGS
1295 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1296 mov T0_16, [A1]
1297 xadd [A0], T0_16
1298 mov [A1], T0_16
1299 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1300 EPILOGUE_3_ARGS
1301ENDPROC iemAImpl_xadd_u16
1302
1303BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1304 PROLOGUE_3_ARGS
1305 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1306 mov T0_32, [A1]
1307 xadd [A0], T0_32
1308 mov [A1], T0_32
1309 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1310 EPILOGUE_3_ARGS
1311ENDPROC iemAImpl_xadd_u32
1312
1313%ifdef RT_ARCH_AMD64
1314BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1315 PROLOGUE_3_ARGS
1316 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1317 mov T0, [A1]
1318 xadd [A0], T0
1319 mov [A1], T0
1320 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1321 EPILOGUE_3_ARGS
1322ENDPROC iemAImpl_xadd_u64
1323%endif ; RT_ARCH_AMD64
1324
; Locked XADD variants - identical to the above but with an explicit LOCK
; prefix on the xadd for atomicity.
1325BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1326 PROLOGUE_3_ARGS
1327 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1328 mov T0_8, [A1]
1329 lock xadd [A0], T0_8
1330 mov [A1], T0_8
1331 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1332 EPILOGUE_3_ARGS
1333ENDPROC iemAImpl_xadd_u8_locked
1334
1335BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1336 PROLOGUE_3_ARGS
1337 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1338 mov T0_16, [A1]
1339 lock xadd [A0], T0_16
1340 mov [A1], T0_16
1341 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1342 EPILOGUE_3_ARGS
1343ENDPROC iemAImpl_xadd_u16_locked
1344
1345BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1346 PROLOGUE_3_ARGS
1347 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1348 mov T0_32, [A1]
1349 lock xadd [A0], T0_32
1350 mov [A1], T0_32
1351 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1352 EPILOGUE_3_ARGS
1353ENDPROC iemAImpl_xadd_u32_locked
1354
1355%ifdef RT_ARCH_AMD64
1356BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1357 PROLOGUE_3_ARGS
1358 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1359 mov T0, [A1]
1360 lock xadd [A0], T0
1361 mov [A1], T0
1362 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1363 EPILOGUE_3_ARGS
1364ENDPROC iemAImpl_xadd_u64_locked
1365%endif ; RT_ARCH_AMD64
1366
1367
1368;
1369; CMPXCHG8B.
1370;
1371; These are tricky register wise, so the code is duplicated for each calling
1372; convention.
1373;
1374; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1375;
1376; C-proto:
1377; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1378; uint32_t *pEFlags));
1379;
1380; Note! Identical to iemAImpl_cmpxchg16b.
1381;
; The guest edx:eax is loaded from *pu64EaxEdx and written back afterwards
; (cmpxchg8b updates eax/edx with the memory value on mismatch); ecx:ebx
; comes from *pu64EbxEcx. rbx is callee-saved, hence the push/pop.
1382BEGINCODE
1383BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1384%ifdef RT_ARCH_AMD64
1385 %ifdef ASM_CALL64_MSC
1386 push rbx
1387
1388 mov r11, rdx ; pu64EaxEdx (is also T1)
1389 mov r10, rcx ; pu64Dst
1390
1391 mov ebx, [r8]
1392 mov ecx, [r8 + 4]
1393 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1394 mov eax, [r11]
1395 mov edx, [r11 + 4]
1396
1397 cmpxchg8b [r10]
1398
1399 mov [r11], eax
1400 mov [r11 + 4], edx
1401 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1402
1403 pop rbx
1404 ret
1405 %else
1406 push rbx
1407
1408 mov r10, rcx ; pEFlags
1409 mov r11, rdx ; pu64EbxEcx (is also T1)
1410
1411 mov ebx, [r11]
1412 mov ecx, [r11 + 4]
1413 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1414 mov eax, [rsi]
1415 mov edx, [rsi + 4]
1416
1417 cmpxchg8b [rdi]
1418
1419 mov [rsi], eax
1420 mov [rsi + 4], edx
1421 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1422
1423 pop rbx
1424 ret
1425
1426 %endif
1427%else
1428 push esi
1429 push edi
1430 push ebx
1431 push ebp
1432
1433 mov edi, ecx ; pu64Dst
1434 mov esi, edx ; pu64EaxEdx
1435 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = four saved regs, 4 = return address)
1436 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1437
1438 mov ebx, [ecx]
1439 mov ecx, [ecx + 4]
1440 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1441 mov eax, [esi]
1442 mov edx, [esi + 4]
1443
1444 cmpxchg8b [edi]
1445
1446 mov [esi], eax
1447 mov [esi + 4], edx
1448 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1449
1450 pop ebp
1451 pop ebx
1452 pop edi
1453 pop esi
1454 ret 8 ; drop the two stack parameters
1455%endif
ENDPROC iemAImpl_cmpxchg8b
1457
; Locked variant of iemAImpl_cmpxchg8b above - identical except for the
; explicit LOCK prefix on the cmpxchg8b instruction.
1458BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1459%ifdef RT_ARCH_AMD64
1460 %ifdef ASM_CALL64_MSC
1461 push rbx
1462
1463 mov r11, rdx ; pu64EaxEdx (is also T1)
1464 mov r10, rcx ; pu64Dst
1465
1466 mov ebx, [r8]
1467 mov ecx, [r8 + 4]
1468 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1469 mov eax, [r11]
1470 mov edx, [r11 + 4]
1471
1472 lock cmpxchg8b [r10]
1473
1474 mov [r11], eax
1475 mov [r11 + 4], edx
1476 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1477
1478 pop rbx
1479 ret
1480 %else
1481 push rbx
1482
1483 mov r10, rcx ; pEFlags
1484 mov r11, rdx ; pu64EbxEcx (is also T1)
1485
1486 mov ebx, [r11]
1487 mov ecx, [r11 + 4]
1488 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1489 mov eax, [rsi]
1490 mov edx, [rsi + 4]
1491
1492 lock cmpxchg8b [rdi]
1493
1494 mov [rsi], eax
1495 mov [rsi + 4], edx
1496 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1497
1498 pop rbx
1499 ret
1500
1501 %endif
1502%else
1503 push esi
1504 push edi
1505 push ebx
1506 push ebp
1507
1508 mov edi, ecx ; pu64Dst
1509 mov esi, edx ; pu64EaxEdx
1510 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = four saved regs, 4 = return address)
1511 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1512
1513 mov ebx, [ecx]
1514 mov ecx, [ecx + 4]
1515 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1516 mov eax, [esi]
1517 mov edx, [esi + 4]
1518
1519 lock cmpxchg8b [edi]
1520
1521 mov [esi], eax
1522 mov [esi + 4], edx
1523 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1524
1525 pop ebp
1526 pop ebx
1527 pop edi
1528 pop esi
1529 ret 8 ; drop the two stack parameters
1530%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1532
1533%ifdef RT_ARCH_AMD64
1534
1535;
1536; CMPXCHG16B.
1537;
1538; These are tricky register wise, so the code is duplicated for each calling
1539; convention.
1540;
1541; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1542;
1543; C-proto:
1544; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1545; uint32_t *pEFlags));
1546;
1547; Note! Identical to iemAImpl_cmpxchg8b.
1548;
; Same structure as iemAImpl_cmpxchg8b, but operating on 64-bit halves of a
; 128-bit operand (rdx:rax / rcx:rbx, 8-byte offsets).
1549BEGINCODE
1550BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1551 %ifdef ASM_CALL64_MSC
1552 push rbx
1553
1554 mov r11, rdx ; pu64RaxRdx (is also T1)
1555 mov r10, rcx ; pu64Dst
1556
1557 mov rbx, [r8]
1558 mov rcx, [r8 + 8]
1559 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1560 mov rax, [r11]
1561 mov rdx, [r11 + 8]
1562
1563 cmpxchg16b [r10]
1564
1565 mov [r11], rax
1566 mov [r11 + 8], rdx
1567 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1568
1569 pop rbx
1570 ret
1571 %else
1572 push rbx
1573
1574 mov r10, rcx ; pEFlags
1575 mov r11, rdx ; pu64RbxRcx (is also T1)
1576
1577 mov rbx, [r11]
1578 mov rcx, [r11 + 8]
1579 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1580 mov rax, [rsi]
1581 mov rdx, [rsi + 8]
1582
1583 cmpxchg16b [rdi]
1584
1585 mov [rsi], rax
1586 mov [rsi + 8], rdx
1587 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1588
1589 pop rbx
1590 ret
1591
1592 %endif
1593ENDPROC iemAImpl_cmpxchg16b
1594
; Locked variant of iemAImpl_cmpxchg16b above - identical except for the
; explicit LOCK prefix on the cmpxchg16b instruction.
1595BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1596 %ifdef ASM_CALL64_MSC
1597 push rbx
1598
1599 mov r11, rdx ; pu64RaxRdx (is also T1)
1600 mov r10, rcx ; pu64Dst
1601
1602 mov rbx, [r8]
1603 mov rcx, [r8 + 8]
1604 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1605 mov rax, [r11]
1606 mov rdx, [r11 + 8]
1607
1608 lock cmpxchg16b [r10]
1609
1610 mov [r11], rax
1611 mov [r11 + 8], rdx
1612 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1613
1614 pop rbx
1615 ret
1616 %else
1617 push rbx
1618
1619 mov r10, rcx ; pEFlags
1620 mov r11, rdx ; pu64RbxRcx (is also T1)
1621
1622 mov rbx, [r11]
1623 mov rcx, [r11 + 8]
1624 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1625 mov rax, [rsi]
1626 mov rdx, [rsi + 8]
1627
1628 lock cmpxchg16b [rdi]
1629
1630 mov [rsi], rax
1631 mov [rsi + 8], rdx
1632 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1633
1634 pop rbx
1635 ret
1636
1637 %endif
1638ENDPROC iemAImpl_cmpxchg16b_locked
1639
1640%endif ; RT_ARCH_AMD64
1641
1642
1643;
1644; CMPXCHG.
1645;
1646; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1647;
1648; C-proto:
1649; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1650;
; A0 = puXDst, A1 = pointer to the accumulator (rAX) value, A2 = the
; register operand, A3 = pEFlags. The accumulator is loaded into al/ax/
; eax/rax as cmpxchg requires and written back after the operation.
;
; @param 1 Instruction prefix - empty or 'lock'.
; @param 2 Function name suffix - empty or '_locked'.
1651BEGINCODE
1652%macro IEMIMPL_CMPXCHG 2
1653BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1654 PROLOGUE_4_ARGS
1655 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1656 mov al, [A1] ; al = accumulator (compare value)
1657 %1 cmpxchg [A0], A2_8
1658 mov [A1], al ; write back the (possibly updated) accumulator
1659 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1660 EPILOGUE_4_ARGS
1661ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1662
1663BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1664 PROLOGUE_4_ARGS
1665 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1666 mov ax, [A1]
1667 %1 cmpxchg [A0], A2_16
1668 mov [A1], ax
1669 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1670 EPILOGUE_4_ARGS
1671ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1672
1673BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1674 PROLOGUE_4_ARGS
1675 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1676 mov eax, [A1]
1677 %1 cmpxchg [A0], A2_32
1678 mov [A1], eax
1679 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1680 EPILOGUE_4_ARGS
1681ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1682
1683BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1684%ifdef RT_ARCH_AMD64
1685 PROLOGUE_4_ARGS
1686 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1687 mov rax, [A1]
1688 %1 cmpxchg [A0], A2
1689 mov [A1], rax
1690 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1691 EPILOGUE_4_ARGS
1692%else
1693 ;
1694 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1695 ;
1696 push esi
1697 push edi
1698 push ebx
1699 push ebp
1700
1701 mov edi, ecx ; pu64Dst
1702 mov esi, edx ; pu64Rax
1703 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1704 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1705
1706 mov ebx, [ecx]
1707 mov ecx, [ecx + 4]
1708 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1709 mov eax, [esi]
1710 mov edx, [esi + 4]
1711
1712 lock cmpxchg8b [edi]
1713
1714 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1715 jz .cmpxchg8b_not_equal
1716 cmp eax, eax ; just set the other flags (ZF=1, CF/OF/SF=0).
1717.store:
1718 mov [esi], eax
1719 mov [esi + 4], edx
1720 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1721
1722 pop ebp
1723 pop ebx
1724 pop edi
1725 pop esi
1726 ret 8
1727
1728.cmpxchg8b_not_equal:
1729 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1730 jne .store
1731 cmp [esi], eax
1732 jmp .store
1733
1734%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG
1737
; Instantiate the plain and locked variants.
1738IEMIMPL_CMPXCHG , ,
1739IEMIMPL_CMPXCHG lock, _locked
1740
1741;;
1742; Macro for implementing a unary operator.
1743;
1744; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1745; variants, except on 32-bit system where the 64-bit accesses requires hand
1746; coding.
1747;
1748; All the functions takes a pointer to the destination memory operand in A0,
1749; the source register operand in A1 and a pointer to eflags in A2.
1750;
1751; @param 1 The instruction mnemonic.
1752; @param 2 The modified flags.
1753; @param 3 The undefined flags.
1754;
; @note The operand is modified in place via the [A0] pointer; A1 is the
; eflags pointer for these two-argument functions.
1755%macro IEMIMPL_UNARY_OP 3
1756BEGINCODE
1757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1758 PROLOGUE_2_ARGS
1759 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1760 %1 byte [A0] ; apply the unary operator in place
1761 IEM_SAVE_FLAGS A1, %2, %3
1762 EPILOGUE_2_ARGS
1763ENDPROC iemAImpl_ %+ %1 %+ _u8
1764
1765BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1766 PROLOGUE_2_ARGS
1767 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1768 lock %1 byte [A0]
1769 IEM_SAVE_FLAGS A1, %2, %3
1770 EPILOGUE_2_ARGS
1771ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1772
1773BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1774 PROLOGUE_2_ARGS
1775 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1776 %1 word [A0]
1777 IEM_SAVE_FLAGS A1, %2, %3
1778 EPILOGUE_2_ARGS
1779ENDPROC iemAImpl_ %+ %1 %+ _u16
1780
1781BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1782 PROLOGUE_2_ARGS
1783 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1784 lock %1 word [A0]
1785 IEM_SAVE_FLAGS A1, %2, %3
1786 EPILOGUE_2_ARGS
1787ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1788
1789BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1790 PROLOGUE_2_ARGS
1791 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1792 %1 dword [A0]
1793 IEM_SAVE_FLAGS A1, %2, %3
1794 EPILOGUE_2_ARGS
1795ENDPROC iemAImpl_ %+ %1 %+ _u32
1796
1797BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1798 PROLOGUE_2_ARGS
1799 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1800 lock %1 dword [A0]
1801 IEM_SAVE_FLAGS A1, %2, %3
1802 EPILOGUE_2_ARGS
1803ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1804
1805 %ifdef RT_ARCH_AMD64
1806BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1807 PROLOGUE_2_ARGS
1808 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1809 %1 qword [A0]
1810 IEM_SAVE_FLAGS A1, %2, %3
1811 EPILOGUE_2_ARGS
1812ENDPROC iemAImpl_ %+ %1 %+ _u64
1813
1814BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1815 PROLOGUE_2_ARGS
1816 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1817 lock %1 qword [A0]
1818 IEM_SAVE_FLAGS A1, %2, %3
1819 EPILOGUE_2_ARGS
1820ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1821 %endif ; RT_ARCH_AMD64
1822
1823%endmacro
1824
; Note: INC/DEC leave CF untouched (it is not in the modified mask);
; NEG modifies all six status flags; NOT modifies none.
1825IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1826IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1827IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1828IEMIMPL_UNARY_OP not, 0, 0
1829
1830
1831;
1832; BSWAP. No flag changes.
1833;
1834; Each function takes one argument, pointer to the value to bswap
1835; (input/output). They all return void.
1836;
1837BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1838 PROLOGUE_1_ARGS
1839 mov T0_32, [A0] ; just in case any of the upper bits are used.
; The hand-assembled 66h operand-size prefix turns the following into the
; 16-bit form of BSWAP, whose result is not architecturally defined;
; presumably executing it natively gives the guest the host CPU's
; behaviour for this case - TODO confirm.
1840 db 66h
1841 bswap T0_32
1842 mov [A0], T0_32 ; note: writes all 32 bits back.
1843 EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16
1845
; Straightforward in-place 32-bit byte swap.
1846BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1847 PROLOGUE_1_ARGS
1848 mov T0_32, [A0]
1849 bswap T0_32
1850 mov [A0], T0_32
1851 EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32
1853
; 64-bit byte swap; done natively on AMD64, while the 32-bit host version
; swaps each half and stores them crosswise.
1854BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1855%ifdef RT_ARCH_AMD64
1856 PROLOGUE_1_ARGS
1857 mov T0, [A0]
1858 bswap T0
1859 mov [A0], T0
1860 EPILOGUE_1_ARGS
1861%else
1862 PROLOGUE_1_ARGS
1863 mov T0, [A0] ; T0 = low dword
1864 mov T1, [A0 + 4] ; T1 = high dword
1865 bswap T0
1866 bswap T1
1867 mov [A0 + 4], T0 ; swapped low dword becomes the high dword
1868 mov [A0], T1 ; and vice versa
1869 EPILOGUE_1_ARGS
1870%endif
ENDPROC iemAImpl_bswap_u64
1872
1873
1874;;
1875; Macro for implementing a shift operation.
1876;
1877; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1878; 32-bit system where the 64-bit accesses requires hand coding.
1879;
1880; All the functions takes a pointer to the destination memory operand in A0,
1881; the shift count in A1 and a pointer to eflags in A2.
1882;
1883; @param 1 The instruction mnemonic.
1884; @param 2 The modified flags.
1885; @param 3 The undefined flags.
1886;
1887; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1888;
1889; @note the _intel and _amd variants are implemented in C.
1890;
1891%macro IEMIMPL_SHIFT_OP 3
1892BEGINCODE
1893BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1894 PROLOGUE_3_ARGS
1895 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1896 %ifdef ASM_CALL64_GCC
1897 mov cl, A1_8 ; the shift count must be in cl
1898 %1 byte [A0], cl
1899 %else
; Non-GCC conventions have A0 in ecx/rcx, so swapping A0 and A1 puts the
; count into cl and the destination pointer into A1.
1900 xchg A1, A0
1901 %1 byte [A1], cl
1902 %endif
1903 IEM_SAVE_FLAGS A2, %2, %3
1904 EPILOGUE_3_ARGS
1905ENDPROC iemAImpl_ %+ %1 %+ _u8
1906
1907BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1908 PROLOGUE_3_ARGS
1909 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1910 %ifdef ASM_CALL64_GCC
1911 mov cl, A1_8
1912 %1 word [A0], cl
1913 %else
1914 xchg A1, A0
1915 %1 word [A1], cl
1916 %endif
1917 IEM_SAVE_FLAGS A2, %2, %3
1918 EPILOGUE_3_ARGS
1919ENDPROC iemAImpl_ %+ %1 %+ _u16
1920
1921BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1922 PROLOGUE_3_ARGS
1923 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1924 %ifdef ASM_CALL64_GCC
1925 mov cl, A1_8
1926 %1 dword [A0], cl
1927 %else
1928 xchg A1, A0
1929 %1 dword [A1], cl
1930 %endif
1931 IEM_SAVE_FLAGS A2, %2, %3
1932 EPILOGUE_3_ARGS
1933ENDPROC iemAImpl_ %+ %1 %+ _u32
1934
1935 %ifdef RT_ARCH_AMD64
1936BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1937 PROLOGUE_3_ARGS
1938 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1939 %ifdef ASM_CALL64_GCC
1940 mov cl, A1_8
1941 %1 qword [A0], cl
1942 %else
1943 xchg A1, A0
1944 %1 qword [A1], cl
1945 %endif
1946 IEM_SAVE_FLAGS A2, %2, %3
1947 EPILOGUE_3_ARGS
1948ENDPROC iemAImpl_ %+ %1 %+ _u64
1949 %endif ; RT_ARCH_AMD64
1950
1951%endmacro
1952
; Rotates only define OF and CF; the plain shifts leave AF undefined.
1953IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1954IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1955IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1956IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1957IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1958IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1959IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1960
1961
1962;;
1963; Macro for implementing a double precision shift operation.
1964;
1965; This will generate code for the 16, 32 and 64 bit accesses, except on
1966; 32-bit system where the 64-bit accesses requires hand coding.
1967;
1968; The functions takes the destination operand (r/m) in A0, the source (reg) in
1969; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1970;
1971; @param 1 The instruction mnemonic.
1972; @param 2 The modified flags.
1973; @param 3 The undefined flags.
1974;
1975; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1976;
1977; @note the _intel and _amd variants are implemented in C.
1978;
1979%macro IEMIMPL_SHIFT_DBL_OP 3
1980BEGINCODE
1981BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1982 PROLOGUE_4_ARGS
1983 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1984 %ifdef ASM_CALL64_GCC
; GCC: A3 is rcx; swap A3/A2 so the count lands in cl, then swap back so
; A3 once again points at the eflags for IEM_SAVE_FLAGS below.
1985 xchg A3, A2
1986 %1 [A0], A1_16, cl
1987 xchg A3, A2
1988 %else
; Non-GCC: A0 is ecx/rcx; swap A0/A2 so the count is in cl and the
; destination pointer in A2 (A3/eflags is untouched).
1989 xchg A0, A2
1990 %1 [A2], A1_16, cl
1991 %endif
1992 IEM_SAVE_FLAGS A3, %2, %3
1993 EPILOGUE_4_ARGS
1994ENDPROC iemAImpl_ %+ %1 %+ _u16
1995
1996BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1997 PROLOGUE_4_ARGS
1998 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1999 %ifdef ASM_CALL64_GCC
2000 xchg A3, A2
2001 %1 [A0], A1_32, cl
2002 xchg A3, A2
2003 %else
2004 xchg A0, A2
2005 %1 [A2], A1_32, cl
2006 %endif
2007 IEM_SAVE_FLAGS A3, %2, %3
2008 EPILOGUE_4_ARGS
2009ENDPROC iemAImpl_ %+ %1 %+ _u32
2010
2011 %ifdef RT_ARCH_AMD64
2012BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2013 PROLOGUE_4_ARGS
2014 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2015 %ifdef ASM_CALL64_GCC
2016 xchg A3, A2
2017 %1 [A0], A1, cl
2018 xchg A3, A2
2019 %else
2020 xchg A0, A2
2021 %1 [A2], A1, cl
2022 %endif
2023 IEM_SAVE_FLAGS A3, %2, %3
2024 EPILOGUE_4_ARGS_EX 12
2025ENDPROC iemAImpl_ %+ %1 %+ _u64
2026 %endif ; RT_ARCH_AMD64
2027
2028%endmacro
2029
2030IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2031IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2032
2033
2034;;
2035; Macro for implementing a multiplication operations.
2036;
2037; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2038; 32-bit system where the 64-bit accesses requires hand coding.
2039;
2040; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2041; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2042; pointer to eflags in A3.
2043;
2044; The functions all return 0 so the caller can be used for div/idiv as well as
2045; for the mul/imul implementation.
2046;
2047; @param 1 The instruction mnemonic.
2048; @param 2 The modified flags.
2049; @param 3 The undefined flags.
2050; @param 4 Name suffix.
2051; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2052;
2053; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2054;
2055%macro IEMIMPL_MUL_OP 5
2056BEGINCODE
2057BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2058 PROLOGUE_3_ARGS
2059 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2060 mov al, [A0] ; al = low byte of *puAX
2061 %1 A1_8 ; ax = al * operand
2062 mov [A0], ax ; store the 16-bit product back into *puAX
2063 %if %5 != 1
2064 IEM_SAVE_FLAGS A2, %2, %3
2065 %else
2066 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
2067 %endif
2068 xor eax, eax ; return 0 (shared convention with div/idiv)
2069 EPILOGUE_3_ARGS
2070ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2071
2072BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2073 PROLOGUE_4_ARGS
2074 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2075 mov ax, [A0]
2076 %ifdef ASM_CALL64_GCC
2077 %1 A2_16
2078 mov [A0], ax
2079 mov [A1], dx ; high half of the product goes to *puDX
2080 %else
; A1 is (e)dx in these conventions and the instruction clobbers dx, so
; stash the pointer in T1 first.
2081 mov T1, A1
2082 %1 A2_16
2083 mov [A0], ax
2084 mov [T1], dx
2085 %endif
2086 %if %5 != 1
2087 IEM_SAVE_FLAGS A3, %2, %3
2088 %else
2089 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
2090 %endif
2091 xor eax, eax
2092 EPILOGUE_4_ARGS
2093ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2094
2095BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2096 PROLOGUE_4_ARGS
2097 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2098 mov eax, [A0]
2099 %ifdef ASM_CALL64_GCC
2100 %1 A2_32
2101 mov [A0], eax
2102 mov [A1], edx
2103 %else
2104 mov T1, A1
2105 %1 A2_32
2106 mov [A0], eax
2107 mov [T1], edx
2108 %endif
2109 %if %5 != 1
2110 IEM_SAVE_FLAGS A3, %2, %3
2111 %else
2112 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2113 %endif
2114 xor eax, eax
2115 EPILOGUE_4_ARGS
2116ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2117
2118 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2119BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2120 PROLOGUE_4_ARGS
2121 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2122 mov rax, [A0]
2123 %ifdef ASM_CALL64_GCC
2124 %1 A2
2125 mov [A0], rax
2126 mov [A1], rdx
2127 %else
2128 mov T1, A1
2129 %1 A2
2130 mov [A0], rax
2131 mov [T1], rdx
2132 %endif
2133 %if %5 != 1
2134 IEM_SAVE_FLAGS A3, %2, %3
2135 %else
2136 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2137 %endif
2138 xor eax, eax
2139 EPILOGUE_4_ARGS_EX 12
2140ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2141 %endif ; RT_ARCH_AMD64 - the 32-bit host version lives in IEMAllAImplC.cpp.
2142
2143%endmacro
2144
; Native, Intel and AMD flavours for MUL and IMUL (rDX:rAX forms).
2145IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2146IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2147IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2148IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2149IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2150IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2151
2152
2153BEGINCODE
2154;;
2155; Worker function for negating a 32-bit number in T1:T0
2156; @uses None (T0,T1)
; Computes T1:T0 = 0 - T1:T0 (i.e. negates the 64-bit value held in the
; 32-bit register pair): the input is parked in two stack slots while the
; xchg leaves zeros in the registers, then subtracted back with borrow
; propagation from the low half to the high half.
2157BEGINPROC iemAImpl_negate_T0_T1_u32
2158 push 0
2159 push 0
2160 xchg T0_32, [xSP] ; T0 = 0, [xSP] = original T0
2161 xchg T1_32, [xSP + xCB] ; T1 = 0, [xSP+xCB] = original T1
2162 sub T0_32, [xSP] ; T0 = 0 - original T0
2163 sbb T1_32, [xSP + xCB] ; T1 = 0 - original T1 - borrow
2164 add xSP, xCB*2 ; drop the temporaries
2165 ret
ENDPROC iemAImpl_negate_T0_T1_u32
2167
2168%ifdef RT_ARCH_AMD64
2169;;
2170; Worker function for negating a 64-bit number in T1:T0
2171; @uses None (T0,T1)
; Same stack-slot scheme as iemAImpl_negate_T0_T1_u32, but with the 64-bit
; registers, negating the 128-bit value in the T1:T0 pair.
2172BEGINPROC iemAImpl_negate_T0_T1_u64
2173 push 0
2174 push 0
2175 xchg T0, [xSP] ; T0 = 0, [xSP] = original T0
2176 xchg T1, [xSP + xCB] ; T1 = 0, [xSP+xCB] = original T1
2177 sub T0, [xSP] ; T0 = 0 - original T0
2178 sbb T1, [xSP + xCB] ; T1 = 0 - original T1 - borrow
2179 add xSP, xCB*2 ; drop the temporaries
2180 ret
ENDPROC iemAImpl_negate_T0_T1_u64
2182%endif
2183
2184
;;
; Macro for implementing a division operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller (divide by zero or quotient overflow).
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
; @param        5       Function suffix.
; @param        6       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; compare AH (high half of dividend) with the divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; high word of the dividend vs. divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = raise #DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; high dword of the dividend vs. divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; clean up the divisor saved by the signed path.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; high qword of the dividend vs. divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; clean up the divisor saved by the signed path.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = raise #DE
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro
2525
; Instantiate the div/idiv workers: the native-EFLAGS variant plus the Intel
; (param 6 = 1, treated like native) and AMD (param 6 = 2, sets AF and clears
; PF/ZF/SF) variants for the architecturally undefined flags.
IEMIMPL_DIV_OP div,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, ,       0
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _intel, 1
IEMIMPL_DIV_OP div,  0, 0,                                                                             0, _amd,   2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, ,       0
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0,                                                                             1, _amd,   2
2532
2533
;;
; Macro for implementing a memory fence operation.
;
; No return value, no operands or anything.
;
; @param 1 The fence instruction (lfence/sfence/mfence).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; emit the fence instruction itself.
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2552
;;
; Alternative memory fence for hosts without SSE2 (no m/l/sfence).
;
; XCHG with a memory operand carries an implicit LOCK prefix, so a dummy
; exchange on the stack serves as a full memory barrier.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; create a scratch stack slot.
        xchg    xAX, [xSP]              ; implicitly LOCKed; also restores xAX.
        add     xSP, xCB                ; drop the scratch slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
2562
2563
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Uses the 32-bit registers (T0_32/T1_32) like the sibling macro
; FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 below - shorter encodings on
; AMD64 with identical zero-extending semantics.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; dump the current FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; activate the merged environment.
%endmacro
2589
2590
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; dump the current FPU environment so we can patch it.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, index of the guest ST0 in FTW.
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty).
%%st0_not_empty:

        fldenv  [xSP]                   ; activate the merged environment.
%endmacro
2628
2629
;;
; FPU result record: one 80-bit value plus the resulting status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; The 80-bit result value.
    .FSW        resw 1                  ; The output FPU status word.
endstruc


;;
; FPU result record for instructions producing two values.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; The first 80-bit result value.
    .FSW        resw 1                  ; The output FPU status word.
    .r80Result2 resw 5                  ; The second 80-bit result value.
endstruc
2647
2648
2649;
2650;---------------------- 16-bit signed integer operations ----------------------
2651;
2652
2653
;;
; Loads a 16-bit signed integer from memory into an 80-bit FPU register value (fild).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fild    word [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2677
2678
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW applied after the load so fistp uses guest rounding.
        fistp   word [A2]

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2702
2703
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation (fisttp).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fisttp  word [A2]               ; truncating store, ignores the rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2728
2729
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      word [A3]               ; ST0 <op>= 16-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2766
2767
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      word [A3]               ; compare ST0 with the 16-bit integer operand.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2798
2799
2800
2801;
2802;---------------------- 32-bit signed integer operations ----------------------
2803;
2804
2805
;;
; Loads a 32-bit signed integer from memory into an 80-bit FPU register value (fild).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fild    dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2829
2830
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW applied after the load so fistp uses guest rounding.
        fistp   dword [A2]

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2854
2855
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation (fisttp).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fisttp  dword [A2]              ; truncating store, ignores the rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2880
2881
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      dword [A3]              ; ST0 <op>= 32-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2918
2919
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      dword [A3]              ; compare ST0 with the 32-bit integer operand.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2950
2951
2952
2953;
2954;---------------------- 64-bit signed integer operations ----------------------
2955;
2956
2957
;;
; Loads a 64-bit signed integer from memory into an 80-bit FPU register value (fild).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fild    qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2981
2982
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW applied after the load so fistp uses guest rounding.
        fistp   qword [A2]

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3006
3007
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation (fisttp).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fisttp  qword [A2]              ; truncating store, ignores the rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3032
3033
3034
3035;
3036;---------------------- 32-bit floating point operations ----------------------
3037;
3038
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fld     dword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3062
3063
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW applied after the load so fst uses guest rounding.
        fst     dword [A2]              ; no pop needed; fninit below cleans up.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3087
3088
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      dword [A3]              ; ST0 <op>= 32-bit float operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3125
3126
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      dword [A3]              ; compare ST0 with the 32-bit float operand.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3157
3158
3159
3160;
3161;---------------------- 64-bit floating point operations ----------------------
3162;
3163
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fld     qword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3187
3188
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW applied after the load so fst uses guest rounding.
        fst     qword [A2]              ; no pop needed; fninit below cleans up.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3212
3213
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      qword [A3]              ; ST0 <op>= 64-bit float operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3250
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      qword [A3]              ; compare ST0 with the 64-bit float operand.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3281
3282
3283
3284;
3285;---------------------- 80-bit floating point operations ----------------------
3286;
3287
;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fld     tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3311
3312
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fstp    tword [A2]

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3336
3337
;;
; Loads an 80-bit floating point register value in BCD format from memory (fbld).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fbld    tword [A2]

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3361
3362
;;
; Store a 80-bit floating point register to memory as BCD (fbstp).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; ST0 = the value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        fbstp   tword [A2]

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3386
3387
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The operand list for the instruction; empty ({}) for instructions
;          with implicit operands like fprem/fprem1/fscale.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A3]              ; becomes ST1 after the next load.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1      %2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv,   {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr,  {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem,  {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3428
3429
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        fld     tword [A2]              ; becomes ST1 after the next load.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW and a safe FSW.
        %1                              ; stores into ST1 and pops, leaving the result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; capture the resulting status word.
        fnclex                          ; clear pending exceptions before storing the result.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3465
3466
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW (compare instructions: the result lives in the C0-C3
; condition bits of the status word, no value is stored).
;
; @param    1   The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; becomes st1.
        fld     tword [A2]              ; becomes st0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1

        fnstsw  word [A1]               ; FSW carries the comparison result (C0/C2/C3).

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3499
3500
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; Used for fcomi/fucomi which set ZF/PF/CF directly instead of the
; x87 condition code bits.
;
; @param    1   The instruction
;
; @returns  EFLAGS in EAX.
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the first 80-bit value.
; @param    A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A3]              ; becomes st1.
        fld     tword [A2]              ; becomes st0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; compares st0 with st1, updating host EFLAGS.

        fnstsw  word [A1]
        pushf                           ; return the EFLAGS the instruction produced...
        pop     xAX                     ; ...in xAX per the @returns contract.

        fninit
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3536
3537
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param    1   The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit                          ; clean x87 state.
        fld     tword [A2]              ; operand in st0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear exceptions so the store cannot fault.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3574
3575
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param    1   The instruction
; @param    2   Non-zero to also restore FTW (fxam classifies empty registers
;               via the tag word, so it needs the guest FTW in place).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a uint16_t for the resulting FSW.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; operand in st0.
%if %2 != 0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word [A1]

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3611
3612
3613
;;
; FPU instruction loading a 80-bit floating point constant.
;
; No memory operand: the instruction itself pushes the constant onto the
; (freshly initialized) x87 stack and we store it back out.
;
; @param    1   The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; rounding mode affects fldl2t & friends.
        %1

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3648
3649
;;
; FPU instruction working on one 80-bit floating point value, outputing two.
;
; The instruction pushes a second value (e.g. fptan pushes 1.0, fxtract the
; significand), so the top of stack goes to r80Result2 and the value below
; it to r80Result1.
;
; @param    1   The instruction
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to a IEMFPURESULTTWO for the output.
; @param    A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h

        fninit
        fld     tword [A2]              ; operand in st0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1

        fnstsw  word [A1 + IEMFPURESULTTWO.FSW]
        fnclex                          ; clear exceptions before each store so neither can fault.
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; the value the instruction pushed (st0).
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; the transformed original operand (st1).

        fninit
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3684
3685
3686
3687
;---------------------- SSE and MMX Operations ----------------------

; The following prologue/epilogue macro pairs are placeholders for any host
; state save/restore the MMX/SSE/AVX helpers may need (e.g. emms for MMX or
; vzeroupper for AVX).  They currently expand to nothing; keep using them so
; such handling can be added in one place.

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3707
3708
;;
; Media instruction working on two full sized registers.
;
; Emits iemAImpl_<insn>_u64 (MMX, optional) and iemAImpl_<insn>_u128 (SSE)
; helpers that perform dst = <insn>(dst, src) through the host's media
; registers.
;
; @param    1   The instruction
; @param    2   Whether there is an MMX variant (1) or not (0).
;
; @param    A0      FPU context (fxsave).
; @param    A1      Pointer to the first media register size operand (input/output).
; @param    A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]
        movq    mm1, [A2]
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back to the first operand.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned loads: the operands are plain memory copies.
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3809
3810
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; Same expansion as IEMIMPL_MEDIA_F2 except the helpers take only the two
; operand pointers (no FPU context is needed).
;
; @param    1   The instruction
; @param    2   Whether there is an MMX variant (1) or not (0).
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0               ; result replaces the first operand.

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3882
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; The whole source is loaded; the instruction itself (punpckl*) only consumes
; the lower half.
;
; @param    1   The instruction
; @param    2   1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, [A1]
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3926
3927
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).  (Header previously said vpunpckh* - that is the
; high-half sibling below; this macro serves the low-half forms.)
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo;
                                        ; latent since both macros currently expand to nothing).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3972
3973
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; NOTE(review): this wrapper is defined but the punpckh* instantiations below
; call IEMIMPL_MEDIA_F1L1 directly (identical expansion) - the wrapper exists
; only for naming symmetry.
;
; @param    1   The instruction
; @param    2   1 if MMX is included, 0 if not.
;
; @param    A0      Pointer to the first full sized media register operand (input/output).
; @param    A1      Pointer to the second full sized media register operand, where we
;                   will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw, 1
IEMIMPL_MEDIA_F1L1 punpckhwd, 1
IEMIMPL_MEDIA_F1L1 punpckhdq, 1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3992
3993
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; Identical expansion to the low-half variant; the instruction itself picks
; the upper halves.
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination register (full sized, output only).
; @param    A1      Pointer to the first full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
; @param    A2      Pointer to the second full sized media source register operand, where we
;                   will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4014
4015
;
; Shufflers with evil 8-bit immediates.
;
; The immediate is encoded in the instruction, so we cannot parameterize it at
; runtime.  Instead we emit 256 fixed-size stubs (one per imm8 value) and
; compute the call target as .imm0 + imm8 * sizeof(stub).
;

BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm0                ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(pshufw+ret) == 9 (endbr64 adds 4 bytes)
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of the stub with imm8 == A2 baked in.
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; build-time check of the stub-size assumption.
ENDPROC iemAImpl_pshufw_u64
4050
4051
;;
; SSE shuffles with an 8-bit immediate (pshufhw/pshuflw/pshufd), using the
; same 256-stub technique as iemAImpl_pshufw_u64 above.
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the source operand.
; @param    A2      The 8-bit immediate.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub-size assumption.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4088IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4089
4090
;;
; AVX shuffles with an 8-bit immediate (vpshufhw/vpshuflw/vpshufd), using the
; same 256-stub technique as the SSE variant above.
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the source operand.
; @param    A2      The 8-bit immediate.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; Fixed: these are AVX helpers; was using IEMIMPL_SSE_PROLOGUE
                                        ; (no functional change, both macros currently expand to nothing).

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_SSE_EPILOGUE.
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4127
4128
4129;
4130; Shifts with evil 8-bit immediates.
4131;
4132
;;
; MMX shifts with an 8-bit immediate, using the 256-stub technique (the shift
; count is encoded in the instruction, so one stub is emitted per imm8 value).
;
; @param    1   The instruction
;
; @param    A0      Pointer to the operand (input/output).
; @param    A1      The 8-bit immediate (shift count).
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]         ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4173
4174
;;
; SSE shifts with an 8-bit immediate, using the 256-stub technique (the shift
; count is encoded in the instruction, so one stub is emitted per imm8 value).
;
; @param    1   The instruction
;
; @param    A0      Pointer to the operand (input/output).
; @param    A1      The 8-bit immediate (shift count).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4217
4218
;
; Move byte mask.
;
; @param    A0      Pointer to the result (stored as a native-size word).
; @param    A1      Pointer to the source media register.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1                ; mask of the byte sign bits into a GPR.
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper half.
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4236
;; SSE variant of the byte-mask move.
; @param    A0      Pointer to the result.
; @param    A1      Pointer to the source media register.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1               ; mask of the byte sign bits into a GPR.
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; zero the upper half on 32-bit hosts.
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4250
;; AVX2 variant of the byte-mask move (256-bit source).
; @param    A0      Pointer to the result.
; @param    A1      Pointer to the source media register.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; mask of the byte sign bits into a GPR.
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; zero the upper half on 32-bit hosts.
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4264
4265
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param    1   The instruction
;
; @param    A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).
; @param    A1      Pointer to the destination media register size operand (output).
; @param    A2      Pointer to the first source media register size operand (input).
; @param    A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo;
                                        ; latent since both macros currently expand to nothing).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4337
4338
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the first source media register size operand (input).
; @param    A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo;
                                        ; latent since both macros currently expand to nothing).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
4413
4414
;;
; Media instruction working on one full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param    1   The instruction
; @param    2   Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo;
                                        ; latent since both macros currently expand to nothing).
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4457
4458
;
; The SSE 4.2 crc32
;
; Accumulator style: *pu32Dst = crc32(*pu32Dst, source).
;
; @param    A0      Pointer to the 32-bit destination (CRC accumulator, in/out).
;                   (The previous comment said A1/A2 - the helpers actually take A0/A1.)
; @param    A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_8
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4474
;; CRC32 with a 16-bit source operand; see iemAImpl_crc32_u8 for the contract.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_16
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4484
;; CRC32 with a 32-bit source operand; see iemAImpl_crc32_u8 for the contract.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_32
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4494
%ifdef RT_ARCH_AMD64
;; CRC32 with a 64-bit source operand (AMD64 hosts only); the accumulator is
;; still 32 bits wide - the 64-bit crc32 form zero-extends into T0 but only
;; the low dword is meaningful and stored.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0, A1
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4506
4507
;
; PTEST (SSE 4.1)
;
; Sets ZF/CF from the AND / ANDN of the two operands; result goes into the
; guest EFLAGS only, neither source is modified.
;
; @param    A0      Pointer to the first source operand (aka readonly destination).
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy host status flags to guest EFLAGS.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4527
;; VPTEST (AVX, 256-bit): same flag semantics as PTEST above.
; @param    A0      Pointer to the first source operand.
; @param    A1      Pointer to the second source operand.
; @param    A2      Pointer to the EFLAGS register.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; Fixed: this is an AVX helper; was using IEMIMPL_SSE_PROLOGUE
                                        ; (no functional change, both macros currently expand to nothing).

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy host status flags to guest EFLAGS.

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_SSE_EPILOGUE.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4540
4541
;;
; Template for the [v]pmov{s,z}x* instructions
;
; @param    1   The instruction
;
; @param    A0      Pointer to the destination media register size operand (output).
; @param    A1      The source operand value (input).
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1                ; NOTE(review): movd moves 32 bits; the wider forms
                                        ; (e.g. pmovsxdq) consume 64 source bits - confirm the
                                        ; callers only rely on the bits this actually transfers.
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; Fixed: was vmovdqu - a VEX encoding #UDs on hosts
                                        ; without AVX, and this is the SSE-only helper.

        IEMIMPL_SSE_EPILOGUE            ; Fixed: was IEMIMPL_SSE_PROLOGUE (copy & paste typo;
                                        ; latent since both macros currently expand to nothing).
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; the 256-bit form takes a pointer to a 128-bit source.
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fixed: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4601
4602
;;
; Result layout for the SSE helpers: 128-bit value followed by the MXCSR.
; Need to move this as well somewhere better?
;
struc IEMSSERESULT
    .uResult        resd 4              ; 128-bit result value.
    .MXCSR          resd 1              ; resulting MXCSR (status flags merged in).
endstruc


;;
; Result layout for the AVX 128-bit helpers.
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
    .uResult        resd 4              ; 128-bit result value.
    .MXCSR          resd 1              ; resulting MXCSR.
endstruc


;;
; Result layout for the AVX 256-bit helpers.
; Need to move this as well somewhere better?
;
struc IEMAVX256RESULT
    .uResult        resd 8              ; 256-bit result value.
    .MXCSR          resd 1              ; resulting MXCSR.
endstruc
4628
4629
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the guest's rounding/denormal controls (FZ, RC, DAZ) are taken over,
; and all exception mask bits are forced set so the host never takes a #XM.
;
; @uses     4 bytes of stack to save the original value, T0.
; @param    1       Expression giving the address of the FXSTATE of the guest.
;
; @note     Leaves the saved host MXCSR on the stack (net xSP -= 4); it is
;           consumed and the stack restored by SSE_ST_FXSTATE_MXCSR.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR (stays on the stack, see @note).
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep guest rounding/denormal controls only.
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions on the host.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4649
4650
;;
; Restores the SSE MXCSR register with the original value.
;
; Reads the MXCSR the instruction produced, merges its exception status flags
; into the guest MXCSR value and stores the combination in the result struct,
; then restores the host MXCSR that SSE_LD_FXSTATE_MXCSR left on the stack.
;
; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param    1       Expression giving the address where to return the MXCSR value.
; @param    2       Expression giving the address of the FXSTATE of the guest.
;
; @note     Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch the post-instruction MXCSR.
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; only the exception status flags come from the host run.
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR...
        add     xSP, 4                  ; ...and undo its remaining stack allocation.
%endmacro
4674
4675
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; The host MXCSR is saved on the stack and deliberately left there; the
; matching AVX128/256_ST_XSAVEAREA_MXCSR macro pops it again.
;
; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR this does not OR in
; X86_MXCSR_XCPT_MASK, i.e. the exception mask bits end up clear --
; verify that an unmasked host #XM cannot be raised here.
;
; @uses 4 bytes of stack to save the original value.
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; Save host MXCSR; stays on the stack for the *_ST_* macro.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; Keep only the guest control bits we honour.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
4694
4695
;;
; Restores the AVX128 MXCSR register with the original value.
;
; Stores the post-operation MXCSR straight into the result structure and
; restores the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
;
; @param 1 Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer (pops the slot left by AVX_LD_XSAVEAREA_MXCSR).
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR]

        ldmxcsr [xSP]                   ; Restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
        add     xSP, 4
%endmacro
4709
4710
;;
; Restores the AVX256 MXCSR register with the original value.
;
; Stores the post-operation MXCSR straight into the result structure and
; restores the host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
;
; @param 1 Expression giving the address where to return the MXCSR value.
;
; @note Restores the stack pointer (pops the slot left by AVX_LD_XSAVEAREA_MXCSR).
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR]

        ldmxcsr [xSP]                   ; Restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR.
        add     xSP, 4
%endmacro
4724
4725
;;
; Floating point instruction working on two full sized registers.
;
; Loads the guest rounding/FZ/DAZ bits into MXCSR (exceptions masked for the
; SSE variant), performs the operation, stores the result and the resulting
; MXCSR into the result structure, and restores the host MXCSR.
;
; @param 1 The instruction
; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE            ; Fix: was IEMIMPL_SSE_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; Three-operand AVX form.
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; Two-operand AVX form (unary operations).
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4815
; Binary packed FP operations; all have three-operand AVX forms.
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3
4834
4835
;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.  (AVX forms take two operands, hence the 2.)
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4851
4852
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; Same MXCSR handling as IEMIMPL_FP_F2, but the second operand is a 32-bit
; scalar loaded with movd/vmovd.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; 32-bit scalar source.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4894
; Scalar single-precision operations.
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
4905
4906
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; Same MXCSR handling as IEMIMPL_FP_F2, but the second operand is a 64-bit
; scalar loaded with movq/vmovq.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; 64-bit scalar source.
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro
4948
; Scalar double-precision operations.
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4957
4958
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; These conversions change the element width, so the 256-bit AVX variant has
; asymmetric register sizes (xmm on one side, ymm on the other).
;
; 1 The instruction name.
; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; 256-bit source narrowed to a 128-bit result.
 %else
        v %+ %1 ymm0, xmm1              ; 128-bit source widened to a 256-bit result.
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro
5020
IEMIMPL_CVT_F2 cvtpd2ps, 0              ; Narrowing: ymm source -> xmm result.
IEMIMPL_CVT_F2 cvtps2pd, 1              ; Widening:  xmm source -> ymm result.
5023
5024
;;
; shufps instructions with 8-bit immediates.
;
; The immediate cannot be a runtime value in the encoding, so a 256-entry
; jump table is generated (one shufps+ret+int3 stub per imm8 value) and the
; stub for the requested immediate is called.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; Base of the imm8 stub table.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Execute the stub for this immediate.
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; Pads the stub to a uniform size.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufps_u128
5063
5064
;;
; shufpd instruction with 8-bit immediates.
;
; Same jump-table technique as iemAImpl_shufps_u128; shufpd's encoding is
; one byte longer than shufps', so no int3 padding is needed here.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_shufpd_u128
5102
5103
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Generates 128-bit and 256-bit workers, each using a 256-entry jump table
; of per-imm8 stubs (see iemAImpl_shufps_u128 for the technique).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5177
IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5180
5181
;;
; One of the [p]blendv{b,ps,pd} variants
;
; These instructions take their selector mask implicitly in XMM0, hence the
; mask is loaded into xmm0 and the actual operands into xmm1/xmm2.
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (input/output).
; @param A1 Pointer to the second media sized value (input).
; @param A2 Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]              ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; Fix: was IEMIMPL_SSE_PROLOGUE (copy & paste error).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5206
IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5210
5211
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; The AVX forms take the mask as an explicit fourth register operand, so no
; implicit XMM0 trickery is needed here.
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 Pointer to the first media register sized operand (input).
; @param A2 Pointer to the second media register sized operand (input).
; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; Fix: was IEMIMPL_AVX_PROLOGUE (copy & paste error).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5250
IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5254
5255
;;
; palignr mm1, mm2/m64 instruction.
;
; MMX variant; uses a 256-entry imm8 jump table like the shufps helper.
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 The second register sized operand (input, passed by value).
; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]
        movq    mm1, A1
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_palignr_u64
5292
5293
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Uses a 256-entry jump table of 8-byte stubs (insn+ret+int3 padding).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5338
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5346
5347
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Uses a 256-entry jump table of 8-byte stubs (insn+ret+int3 padding).
;
; @param 1 The instruction name.
; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro
5431
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0 ; 128-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1 ; 256-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1 ; 256-bit only.
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5440
5441
;;
; Source operand packages for the pcmpistr*/pcmpestr* helpers.
; @todo Need to move this as well somewhere better?
;
struc IEMPCMPISTRXSRC
    .uSrc1       resd 4                 ; First 128-bit string operand.
    .uSrc2       resd 4                 ; Second 128-bit string operand.
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1       resd 4                 ; First 128-bit string operand.
    .uSrc2       resd 4                 ; Second 128-bit string operand.
    .u64Rax      resd 2                 ; Explicit length register RAX (64 bits as two dwords).
    .u64Rdx      resd 2                 ; Explicit length register RDX (64 bits as two dwords).
endstruc
5456
;;
; The pcmpistri instruction.
;
; Uses a 256-entry imm8 jump table; the index result is produced in ECX and
; the status flags are captured for the guest.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; Store the index result via the saved pointer.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistri_u128
5499
;;
; The pcmpestri instruction.
;
; Like pcmpistri but with explicit lengths: the guest RAX/RDX values are
; loaded into the host registers around the call into the imm8 stub table.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; Store the index result via the saved pointer.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestri_u128
5546
;;
; The pcmpistrm instruction template.
;
; Uses a 256-entry imm8 jump table; the mask result is produced in XMM0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret) == 8: A3 * 8
                                        ; Fix: result was computed into T0 (lea T0, [T1 + A3*8]) while the
                                        ; call below uses T1, so the non-IBT build always executed the
                                        ; imm8=0 stub.  Index T1 directly as the pcmpestrm variant does.
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; Mask result is delivered in xmm0.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpistrm_u128
5588
;;
; The pcmpestrm instruction template.
;
; Like pcmpistrm but with explicit lengths in guest RAX/RDX, which are loaded
; into the host registers around the call into the imm8 stub table.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; Mask result is delivered in xmm0.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_pcmpestrm_u128
5634
5635
;;
; pinsrw instruction.
;
; MMX variant; 256-entry imm8 jump table of 5-byte stubs (9 with endbr).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input, passed by value).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pinsrw_u64
5672
; SSE variant of pinsrw; same arguments as iemAImpl_pinsrw_u64, but the
; destination is a 128-bit register and the stubs are 6 bytes (10 with endbr).
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pinsrw_u128
5702
;;
; vpinsrw instruction.
;
; AVX variant with separate destination; the word operand is shuffled into A1
; before the call because the stubs reference it by that register name.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 16 bit input operand (input, passed by value).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpinsrw_u128
5741
5742
;;
; pextrw instruction.
;
; MMX variant; the stubs leave the extracted word in T0, which is then
; stored through the output pointer.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 The media register size operand (input, passed by value).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, A1
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; Stub returned the word in T0.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_pextrw_u64
5779
; pextrw instruction, SSE variant - same computed-call imm8 table scheme as
; the MMX variant above, but the source is loaded through a pointer and the
; stubs are 6 bytes (w/o IBT) instead of 5.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of stub for this imm8
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pextrw_u128
5809
;;
; vpextrw instruction.
;
; VEX variant of pextrw_u128 above; identical computed-call imm8 stub table
; scheme with 6-byte stubs (w/o IBT).
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of stub for this imm8
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_vpextrw_u128
5846
5847
;;
; movmskp{s,d} SSE instruction template
;
; Emits three workers: the SSE form (%1, 128-bit), the AVX 128-bit form (%2)
; and the AVX 256-bit form (%2).  Only the low byte of the mask result is
; stored to *A0.
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5897
5898
;;
; Restores the SSE MXCSR register with the original value.
;
; Stores the guest MXCSR (with freshly raised status flags merged in from the
; real MXCSR) to *%1, then restores the host MXCSR value that the matching
; load macro left at [xSP] (same stack layout as SSE_LD_FXSTATE_MXCSR_ONLY
; below - TODO confirm against the SSE_LD_FXSTATE_MXCSR definition).
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read current MXCSR (holds new status flags)
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; restore host MXCSR saved by the load macro
        add     xSP, 4                  ; pop the load macro's save slot
%endmacro
5922
5923
;;
; cvttsd2si instruction - 32-bit variant.
;
; Truncating double -> i32 conversion, performed under the guest MXCSR
; configuration loaded by SSE_LD_FXSTATE_MXCSR and with the resulting status
; flags returned via *A1.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64

;;
; cvttsd2si instruction - 64-bit variant.
;
; Same as above but converting to i64.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5965
5966
;;
; cvtsd2si instruction - 32-bit variant.
;
; Rounding double -> i32 conversion (rounding mode taken from the guest MXCSR
; loaded by SSE_LD_FXSTATE_MXCSR); resulting status flags returned via *A1.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64

;;
; cvtsd2si instruction - 64-bit variant.
;
; Same as above but converting to i64.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6008
6009
;;
; cvttss2si instruction - 32-bit variant.
;
; Truncating float -> i32 conversion under the guest MXCSR configuration;
; resulting status flags returned via *A1.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvttss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant.
;
; Same as above but converting to i64.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6051
6052
;;
; cvtss2si instruction - 32-bit variant.
;
; Rounding float -> i32 conversion (rounding mode from the guest MXCSR);
; resulting status flags returned via *A1.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvtss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant.
;
; Same as above but converting to i64.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6094
6095
;;
; cvtsi2ss instruction - 32-bit variant.
;
; i32 -> float conversion; only the low 32 bits of xmm0 are stored, so the
; undefined upper lanes of the scratch register do not matter.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0        ; store the scalar result only

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant.
;
; Same as above but converting from i64 (may round).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6137
6138
;;
; cvtsi2sd instruction - 32-bit variant.
;
; i32 -> double conversion; only the low 64 bits of xmm0 are stored, so the
; undefined upper lane of the scratch register does not matter.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; install guest MXCSR, host value saved on stack

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; store the scalar result only

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return merged MXCSR, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant.
;
; Same as above but converting from i64 (may round).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6180
6181
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Saves the host MXCSR at [xSP] and loads a value combining the guest's
; FZ/RC/DAZ control bits with all exceptions masked, so the emulated
; instruction cannot raise a host SIMD FP exception.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
; @note Leaves xSP 4 bytes lower (host MXCSR save slot); popped by the
;       matching SSE_ST_FXSTATE_MXCSR_ONLY* macro.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR; stays on the stack
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6201
6202
;;
; Restores the SSE MXCSR register with the original value.
;
; Like SSE_ST_FXSTATE_MXCSR_ONLY, but the guest MXCSR lives at *%1 directly
; (no FXSTATE): new status flags are merged into *%1 and the host MXCSR
; saved by SSE_LD_FXSTATE_MXCSR_ONLY is restored from [xSP].
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; read current MXCSR (holds new status flags)
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; restore host MXCSR saved by the load macro
        add     xSP, 4                  ; pop the load macro's save slot
%endmacro
6225
6226
;
; UCOMISS (SSE)
;
; Compares the scalar singles and captures the resulting EFLAGS status bits;
; exception status flags are merged back into the guest MXCSR at *A0.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128

; VEX encoded variant - same contract as above.
; NOTE(review): uses the SSE prologue/epilogue rather than the AVX ones
; (consistent with the other v*comis* workers here) - confirm intentional.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6264
6265
;
; UCOMISD (SSE)
;
; Compares the scalar doubles and captures the resulting EFLAGS status bits;
; exception status flags are merged back into the guest MXCSR at *A0.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128

; VEX encoded variant - same contract as above.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6303
;
; COMISS (SSE)
;
; Ordered variant of UCOMISS above; same EFLAGS/MXCSR handling.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128

; VEX encoded variant - same contract as above.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6341
6342
;
; COMISD (SSE)
;
; Ordered variant of UCOMISD above; same EFLAGS/MXCSR handling.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128

; VEX encoded variant - same contract as above.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6380
6381
;;
; Need to move this as well somewhere better?
;
; Layout of the dual 128-bit source block passed to the two-operand media
; workers below (cmpps & friends): two consecutive 16-byte operands.
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first source (aka read-only destination), 16 bytes
    .uSrc2 resd 4                       ; second source, 16 bytes
endstruc
6389
6390
;
; CMPPS (SSE)
;
; The imm8 predicate must be an assembly-time constant, so this dispatches
; via a computed call into 256 pre-assembled 'cmpps xmm0, xmm1, imm' stubs
; (cmpps 4 bytes + ret 1 byte w/o IBT; asserted by
; IEMCHECK_256_JUMP_ARRAY_SIZE).
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = address of stub for this imm8
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
ENDPROC iemAImpl_cmpps_u128
6431
;;
; SSE instructions with 8-bit immediates of the form
;       xxx     xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; Same computed-call imm8 stub table scheme as cmpps above, with 6-byte
; stubs (insn 5 + ret 1, w/o IBT).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of stub for this imm8
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6483
;;
; SSE instructions with 8-bit immediates of the form
;       xxx     xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; The stubs are padded with an int3 to 8 bytes (power of two) so the non-IBT
; index calculation is a single scaled lea.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movzx   A3, A3_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pad the stub to the assumed size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6539
6540
;;
; SSE instructions of the form
;       xxx     mm, xmm.
; and we need to load and save the MXCSR register.
;
; XMM-source -> MMX-destination conversions (packed double to packed i32)
; executed under the guest MXCSR.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX register sized operand (output).
; @param A2 Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6570
;;
; SSE instructions of the form
;       xxx     xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; MMX-source -> XMM-destination conversions; the destination is read first so
; unmodified lanes are preserved in the write-back.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register sized operand (input/output).
; @param A2 The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A1]              ; load destination - partial update semantics
        movq    mm0, A2
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6601
;;
; SSE instructions of the form
;       xxx     mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; 64-bit XMM-source -> MMX-destination conversions (packed single to packed
; i32) executed under the guest MXCSR.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX media register sized operand (output).
; @param A2 The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movq    xmm0, A2
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6631
;
; All forms of RDRAND and RDSEED
;
; Executes the instruction into the fixed register %2, stores the value to
; *A0 and captures the resulting EFLAGS (CF indicates success on real hw).
;
; @param 1    The instruction mnemonic (rdrand or rdseed).
; @param 2    The destination register matching %3 (ax/eax/rax).
; @param 3    The operand width in bits (16/32/64), used in the symbol name.
;
; @param A0   Pointer to the destination operand.
; @param A1   Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2
        mov     [A0], %2
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6656
6657
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; The imm8 (round function selector) must be an assembly-time constant, so
; this dispatches via a computed call into 256 pre-assembled stubs (6 bytes
; each w/o IBT).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits (used as table index below)
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the imm8 stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = address of stub for this imm8
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_sha1rnds4_u128
6697
6698
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes xmm0 as an implicit third operand, so the supplied
; constants are loaded into xmm0 first and the explicit operands use
; xmm1/xmm2.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit operand
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1]
        sha256rnds2 xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6721
6722
;
; 32-bit forms of ADCX and ADOX
;
; Loads the relevant carry flag (%2) from the guest EFLAGS, performs the
; add-with-carry into *A0 and writes the updated flag back.
;
; @param 1    The instruction name (adcx or adox).
; @param 2    The EFLAGS bit the instruction consumes/produces
;             (X86_EFL_CF or X86_EFL_OF).
;
; @param A0   Pointer to the destination operand (input/output).
; @param A1   Pointer to the EFLAGS value (input/output).
; @param A2   32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2_32, [A0]
        mov     [A0], A2_32
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; Same as IEMIMPL_ADX_32 above, but with 64-bit operands.
;
; @param 1    The instruction name (adcx or adox).
; @param 2    The EFLAGS bit the instruction consumes/produces
;             (X86_EFL_CF or X86_EFL_OF).
;
; @param A0   Pointer to the destination operand (input/output).
; @param A1   Pointer to the EFLAGS value (input/output).
; @param A2   64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS

        IEM_LOAD_FLAGS A1, %2, 0
        %1      A2, [A0]
        mov     [A0], A2
        IEM_SAVE_FLAGS A1, %2, 0

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette