VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 103720

最後變更在這個檔案（自 103720 起）是 103700，由 vboxsync 於 13 個月前提交

VMM/IEM: Implement vpblendd instruction dispatch & emulation, bugref:9898

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 198.4 KB
 
1; $Id: IEMAllAImpl.asm 103700 2024-03-06 13:32:01Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1 ; %1 = byte count of stack arguments to pop on return.
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1 ; x86 Windows fastcall: callee pops its stack arguments.
48 %else
49 ret ; other 32-bit ABIs: caller cleans up.
50 %endif
51%else
52 ret ; AMD64: arguments are in registers, nothing to pop.
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
; Default: plain NAME() mangling; overridden below for x86 Windows fastcall decoration.
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs ; e.g. @name@12
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2 ; %1 = C function name, %2 = x86 argument byte count.
78GLOBALNAME_RAW NAME_FASTCALL(%1,%2,@), function, hidden
79 IBT_ENDBRxx ; CET/IBT landing pad for indirect calls, when enabled.
80%endmacro
81
82
83;
84; We employ some macro assembly here to hide the calling convention differences.
85;
86%ifdef RT_ARCH_AMD64 ; AMD64: up to 4 args arrive in registers, so prologues are empty.
87 %macro PROLOGUE_1_ARGS 0
88 %endmacro
89 %macro EPILOGUE_1_ARGS 0
90 ret
91 %endmacro
92 %macro EPILOGUE_1_ARGS_EX 0
93 ret
94 %endmacro
95
96 %macro PROLOGUE_2_ARGS 0
97 %endmacro
98 %macro EPILOGUE_2_ARGS 0
99 ret
100 %endmacro
101 %macro EPILOGUE_2_ARGS_EX 1
102 ret
103 %endmacro
104
105 %macro PROLOGUE_3_ARGS 0
106 %endmacro
107 %macro EPILOGUE_3_ARGS 0
108 ret
109 %endmacro
110 %macro EPILOGUE_3_ARGS_EX 1
111 ret
112 %endmacro
113
114 %macro PROLOGUE_4_ARGS 0
115 %endmacro
116 %macro EPILOGUE_4_ARGS 0
117 ret
118 %endmacro
119 %macro EPILOGUE_4_ARGS_EX 1
120 ret
121 %endmacro
122
123 %ifdef ASM_CALL64_GCC ; System V AMD64: args in rdi, rsi, rdx, rcx.
124 %define A0 rdi
125 %define A0_32 edi
126 %define A0_16 di
127 %define A0_8 dil
128
129 %define A1 rsi
130 %define A1_32 esi
131 %define A1_16 si
132 %define A1_8 sil
133
134 %define A2 rdx
135 %define A2_32 edx
136 %define A2_16 dx
137 %define A2_8 dl
138
139 %define A3 rcx
140 %define A3_32 ecx
141 %define A3_16 cx
142 %define A3_8 cl
143 %endif
144
145 %ifdef ASM_CALL64_MSC ; Microsoft x64: args in rcx, rdx, r8, r9.
146 %define A0 rcx
147 %define A0_32 ecx
148 %define A0_16 cx
149 %define A0_8 cl
150
151 %define A1 rdx
152 %define A1_32 edx
153 %define A1_16 dx
154 %define A1_8 dl
155
156 %define A2 r8
157 %define A2_32 r8d
158 %define A2_16 r8w
159 %define A2_8 r8b
160
161 %define A3 r9
162 %define A3_32 r9d
163 %define A3_16 r9w
164 %define A3_8 r9b
165 %endif
166
167 %define T0 rax ; Scratch (volatile in both 64-bit ABIs).
168 %define T0_32 eax
169 %define T0_16 ax
170 %define T0_8 al
171
172 %define T1 r11 ; Scratch (volatile in both 64-bit ABIs).
173 %define T1_32 r11d
174 %define T1_16 r11w
175 %define T1_8 r11b
176
177 %define T2 r10 ; only AMD64
178 %define T2_32 r10d
179 %define T2_16 r10w
180 %define T2_8 r10b
181
182%else
183 ; x86
; 32-bit fastcall: A0/A1 in ecx/edx; A2/A3 are loaded from the stack into
; callee-saved ebx/esi, hence the push/pop and the callee-pop 'ret n'.
184 %macro PROLOGUE_1_ARGS 0
185 push edi
186 %endmacro
187 %macro EPILOGUE_1_ARGS 0
188 pop edi
189 ret 0
190 %endmacro
191 %macro EPILOGUE_1_ARGS_EX 1
192 pop edi
193 ret %1
194 %endmacro
195
196 %macro PROLOGUE_2_ARGS 0
197 push edi
198 %endmacro
199 %macro EPILOGUE_2_ARGS 0
200 pop edi
201 ret 0
202 %endmacro
203 %macro EPILOGUE_2_ARGS_EX 1
204 pop edi
205 ret %1
206 %endmacro
207
208 %macro PROLOGUE_3_ARGS 0
209 push ebx
210 mov ebx, [esp + 4 + 4] ; A2 = first stack argument (above saved ebx + return address).
211 push edi
212 %endmacro
213 %macro EPILOGUE_3_ARGS_EX 1
214 %if (%1) < 4
215 %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
216 %endif
217 pop edi
218 pop ebx
219 ret %1
220 %endmacro
221 %macro EPILOGUE_3_ARGS 0
222 EPILOGUE_3_ARGS_EX 4
223 %endmacro
224
225 %macro PROLOGUE_4_ARGS 0
226 push ebx
227 push edi
228 push esi
229 mov ebx, [esp + 12 + 4 + 0] ; A2 = 1st stack arg (12 bytes of saves + return address).
230 mov esi, [esp + 12 + 4 + 4] ; A3 = 2nd stack arg.
231 %endmacro
232 %macro EPILOGUE_4_ARGS_EX 1
233 %if (%1) < 8
234 %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
235 %endif
236 pop esi
237 pop edi
238 pop ebx
239 ret %1
240 %endmacro
241 %macro EPILOGUE_4_ARGS 0
242 EPILOGUE_4_ARGS_EX 8
243 %endmacro
244
245 %define A0 ecx
246 %define A0_32 ecx
247 %define A0_16 cx
248 %define A0_8 cl
249
250 %define A1 edx
251 %define A1_32 edx
252 %define A1_16 dx
253 %define A1_8 dl
254
255 %define A2 ebx
256 %define A2_32 ebx
257 %define A2_16 bx
258 %define A2_8 bl
259
260 %define A3 esi
261 %define A3_32 esi
262 %define A3_16 si ; No A3_8: esi has no 8-bit sub-register.
263
264 %define T0 eax
265 %define T0_32 eax
266 %define T0_16 ax
267 %define T0_8 al
268
269 %define T1 edi
270 %define T1_32 edi
271 %define T1_16 di ; No T1_8 and no T2 on x86.
272%endif
273
274
275;;
276; Load the relevant flags from [%1] if there are undefined flags (%3).
277;
278; @remarks Clobbers T0, stack. Changes EFLAGS.
279; @param A2 The register pointing to the flags.
280; @param 1 The parameter (A0..A3) pointing to the eflags.
281; @param 2 The set of modified flags.
282; @param 3 The set of undefined flags.
283; @param 4 Force loading the flags.
284;
; Skipped entirely when there are no undefined flags (%3) and loading is not
; forced (%4, defaults to 1) - the instruction then only writes flags.
285%macro IEM_MAYBE_LOAD_FLAGS 3-4 1
286 %if (%3 + %4) != 0
287 pushf ; store current flags
288 mov T0_32, [%1] ; load the guest flags
289 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
290 and T0_32, (%2 | %3) ; select the modified and undefined flags.
291 or [xSP], T0 ; merge guest flags with host flags.
292 popf ; load the mixed flags.
293 %endif
294%endmacro
295
296;;
297; Load the relevant flags from [%1].
298;
299; @remarks Clobbers T0, stack. Changes EFLAGS.
300; @param A2 The register pointing to the flags.
301; @param 1 The parameter (A0..A3) pointing to the eflags.
302; @param 2 The set of flags to load.
303; @param 3 The set of undefined flags.
304;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always merges the selected
; guest flag bits into the host EFLAGS via the stack.
305%macro IEM_LOAD_FLAGS 3
306 pushf ; store current flags
307 mov T0_32, [%1] ; load the guest flags
308 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
309 and T0_32, (%2 | %3) ; select the modified and undefined flags.
310 or [xSP], T0 ; merge guest flags with host flags.
311 popf ; load the mixed flags.
312%endmacro
313
314;;
315; Update the flag.
316;
317; @remarks Clobbers T0, T1, stack.
318; @param 1 The register pointing to the EFLAGS.
319; @param 2 The mask of modified flags to save.
320; @param 3 The mask of undefined flags to (maybe) save.
321;
; Copies the (%2|%3) bits of the host EFLAGS into the guest EFLAGS at [%1],
; leaving all other guest flag bits untouched. No-op when both masks are zero.
322%macro IEM_SAVE_FLAGS 3
323 %if (%2 | %3) != 0
324 pushf
325 pop T1 ; T1 = host flags after the emulated instruction.
326 mov T0_32, [%1] ; flags
327 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
328 and T1_32, (%2 | %3) ; select the modified and undefined flags.
329 or T0_32, T1_32 ; combine the flags.
330 mov [%1], T0_32 ; save the flags.
331 %endif
332%endmacro
333
334;;
335; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
336;
337; @remarks Clobbers T0, T1, stack.
338; @param 1 The register pointing to the EFLAGS.
339; @param 2 The mask of modified flags to save.
340; @param 3 Mask of additional flags to always clear
341; @param 4 Mask of additional flags to always set.
342;
; Like IEM_SAVE_FLAGS, but additionally always clears the %3 bits and always
; sets the %4 bits in the guest EFLAGS.
343%macro IEM_SAVE_AND_ADJUST_FLAGS 4
344 %if (%2 | %3 | %4) != 0
345 pushf
346 pop T1 ; T1 = host flags.
347 mov T0_32, [%1] ; load flags.
348 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
349 and T1_32, (%2) ; select the modified flags.
350 or T0_32, T1_32 ; combine the flags.
351 %if (%4) != 0
352 or T0_32, %4 ; add the always set flags.
353 %endif
354 mov [%1], T0_32 ; save the result.
355 %endif
356%endmacro
357
358;;
359; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
360; signed input (%4[%5]) and parity index (%6).
361;
362; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
363; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
364; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
365;
366; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
367; @param 1 The register pointing to the EFLAGS.
368; @param 2 The mask of modified flags to save.
369; @param 3 Mask of additional flags to always clear
370; @param 4 The result register to set SF by.
371; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
372; @param 6 The (full) register containing the parity table index. Will be modified!
373
; T0/xAX holds a live result here (MUL/IMUL), so the host flags go into T2 on
; AMD64 and into a stack-preserved T0 on x86; T1 accumulates the new EFLAGS.
374%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
375 %ifdef RT_ARCH_AMD64
376 pushf
377 pop T2
378 %else
379 push T0 ; x86 has no T2; preserve T0 around the flag extraction.
380 pushf
381 pop T0
382 %endif
383 mov T1_32, [%1] ; load flags.
384 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
385 %ifdef RT_ARCH_AMD64
386 and T2_32, (%2) ; select the modified flags.
387 or T1_32, T2_32 ; combine the flags.
388 %else
389 and T0_32, (%2) ; select the modified flags.
390 or T1_32, T0_32 ; combine the flags.
391 pop T0
392 %endif
393
394 ; First calculate SF as it's likely to be referring to the same register as %6 does.
395 bt %4, %5 - 1 ; CF = top bit of the %5-bit result in %4.
396 jnc %%sf_clear
397 or T1_32, X86_EFL_SF
398 %%sf_clear:
399
400 ; Parity last.
401 and %6, 0xff ; index g_afParity by the low result byte.
402 %ifdef RT_ARCH_AMD64
403 lea T2, [NAME(g_afParity) xWrtRIP]
404 or T1_8, [T2 + %6]
405 %else
406 or T1_8, [NAME(g_afParity) + %6]
407 %endif
408
409 mov [%1], T1_32 ; save the result.
410%endmacro
411
412;;
413; Calculates the new EFLAGS using fixed clear and set bit masks.
414;
415; @remarks Clobbers T0.
416; @param 1 The register pointing to the EFLAGS.
417; @param 2 Mask of additional flags to always clear
418; @param 3 Mask of additional flags to always set.
419;
; Pure guest-EFLAGS adjustment: clears %2, sets %3; does not read host flags.
420%macro IEM_ADJUST_FLAGS 3
421 %if (%2 | %3) != 0
422 mov T0_32, [%1] ; Load flags.
423 %if (%2) != 0
424 and T0_32, ~(%2) ; Remove the always cleared flags.
425 %endif
426 %if (%3) != 0
427 or T0_32, %3 ; Add the always set flags.
428 %endif
429 mov [%1], T0_32 ; Save the result.
430 %endif
431%endmacro
432
433;;
434; Calculates the new EFLAGS using fixed clear and set bit masks.
435;
436; @remarks Clobbers T0, %4, EFLAGS.
437; @param 1 The register pointing to the EFLAGS.
438; @param 2 Mask of additional flags to always clear
439; @param 3 Mask of additional flags to always set.
440; @param 4 The (full) register containing the parity table index. Will be modified!
441;
; Like IEM_ADJUST_FLAGS, but also recomputes PF from the low byte of %4.
; NOTE(review): on AMD64 this also clobbers T2 (lea below), which the
; @remarks above does not list - confirm callers don't rely on T2.
442%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
443 mov T0_32, [%1] ; Load flags.
444 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
445 %if (%3) != 0
446 or T0_32, %3 ; Add the always set flags.
447 %endif
448 and %4, 0xff ; index g_afParity by the low byte.
449 %ifdef RT_ARCH_AMD64
450 lea T2, [NAME(g_afParity) xWrtRIP]
451 or T0_8, [T2 + %4]
452 %else
453 or T0_8, [NAME(g_afParity) + %4]
454 %endif
455 mov [%1], T0_32 ; Save the result.
456%endmacro
457
458
459;;
460; Checks that the size expression %1 matches %2 adjusted according to
461; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
462; @param 1 The jump array size assembly expression.
463; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
464;
; Emits 'dw' expressions that overflow (assembler warning) if %1 deviates
; from the expected size %2 (+256*4 when IBT padding enlarges each entry).
465%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
466 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
467 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
468 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
469 %else
470 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
471 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
472 %endif
473%endmacro
474
475
476;*********************************************************************************************************************************
477;* External Symbols *
478;*********************************************************************************************************************************
479extern NAME(g_afParity) ; Byte-indexed (0..255) parity lookup table - presumably maps each byte to its PF value; see users above.
480
481
482;;
483; Macro for implementing a binary operator.
484;
485; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
486; variants, except on 32-bit system where the 64-bit accesses requires hand
487; coding.
488;
489; All the functions takes a pointer to the destination memory operand in A0,
490; the source register operand in A1 and a pointer to eflags in A2.
491;
492; @param 1 The instruction mnemonic.
493; @param 2 Non-zero if there should be a locked version.
494; @param 3 The modified flags.
495; @param 4 The undefined flags.
496; @param 5 Force flag loading (ADC, SBC).
497;
; Emits iemAImpl_<instr>_u{8,16,32,64}[_locked]: A0 = dst pointer, A1 = src
; register, A2 = pointer to guest EFLAGS. u64 only on AMD64.
498%macro IEMIMPL_BIN_OP 5
499BEGINCODE
500BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
501 PROLOGUE_3_ARGS
502 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
503 %1 byte [A0], A1_8
504 IEM_SAVE_FLAGS A2, %3, %4
505 EPILOGUE_3_ARGS
506ENDPROC iemAImpl_ %+ %1 %+ _u8
507
508BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
509 PROLOGUE_3_ARGS
510 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
511 %1 word [A0], A1_16
512 IEM_SAVE_FLAGS A2, %3, %4
513 EPILOGUE_3_ARGS
514ENDPROC iemAImpl_ %+ %1 %+ _u16
515
516BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
517 PROLOGUE_3_ARGS
518 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
519 %1 dword [A0], A1_32
520 IEM_SAVE_FLAGS A2, %3, %4
521 EPILOGUE_3_ARGS
522ENDPROC iemAImpl_ %+ %1 %+ _u32
523
524 %ifdef RT_ARCH_AMD64
525BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
526 PROLOGUE_3_ARGS
527 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
528 %1 qword [A0], A1
529 IEM_SAVE_FLAGS A2, %3, %4
530 EPILOGUE_3_ARGS_EX 8
531ENDPROC iemAImpl_ %+ %1 %+ _u64
532 %endif ; RT_ARCH_AMD64
533
534 %if %2 != 0 ; locked versions requested?
535
536BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
537 PROLOGUE_3_ARGS
538 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
539 lock %1 byte [A0], A1_8
540 IEM_SAVE_FLAGS A2, %3, %4
541 EPILOGUE_3_ARGS
542ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
543
544BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
545 PROLOGUE_3_ARGS
546 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
547 lock %1 word [A0], A1_16
548 IEM_SAVE_FLAGS A2, %3, %4
549 EPILOGUE_3_ARGS
550ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
551
552BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
553 PROLOGUE_3_ARGS
554 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
555 lock %1 dword [A0], A1_32
556 IEM_SAVE_FLAGS A2, %3, %4
557 EPILOGUE_3_ARGS
558ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
559
560 %ifdef RT_ARCH_AMD64
561BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
562 PROLOGUE_3_ARGS
563 IEM_MAYBE_LOAD_FLAGS A2, %3, %4, %5
564 lock %1 qword [A0], A1
565 IEM_SAVE_FLAGS A2, %3, %4
566 EPILOGUE_3_ARGS_EX 8
567ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
568 %endif ; RT_ARCH_AMD64
569 %endif ; locked
570%endmacro
571
572; instr, lock, modified-flags, undefined flags, force loading flags
; ADC and SBB force flag loading (last param = 1) since they consume CF.
573IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
574IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
575IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
576IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 1
577IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
578IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
579IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
580IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
581IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF, 0
582
583
584;;
585; Macro for implementing a binary operator, VEX variant with separate input/output.
586;
587; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
588; where the 64-bit accesses requires hand coding.
589;
590; All the functions takes a pointer to the destination memory operand in A0,
591; the first source register operand in A1, the second source register operand
592; in A2 and a pointer to eflags in A3.
593;
594; @param 1 The instruction mnemonic.
595; @param 2 The modified flags.
596; @param 3 The undefined flags.
597;
; Emits iemAImpl_<instr>_u{32,64}: A0 = dst pointer, A1/A2 = source register
; operands, A3 = pointer to guest EFLAGS. u64 only on AMD64.
598%macro IEMIMPL_VEX_BIN_OP 3
599BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
600 PROLOGUE_4_ARGS
601 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
602 %1 T0_32, A1_32, A2_32
603 mov [A0], T0_32
604 IEM_SAVE_FLAGS A3, %2, %3
605 EPILOGUE_4_ARGS
606ENDPROC iemAImpl_ %+ %1 %+ _u32
607
608 %ifdef RT_ARCH_AMD64
609BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
610 PROLOGUE_4_ARGS
611 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
612 %1 T0, A1, A2
613 mov [A0], T0
614 IEM_SAVE_FLAGS A3, %2, %3
615 EPILOGUE_4_ARGS
616ENDPROC iemAImpl_ %+ %1 %+ _u64
617 %endif ; RT_ARCH_AMD64
618%endmacro
619
620; instr, modified-flags, undefined-flags (BMI1/BMI2 three-operand forms)
621IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
622IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
623IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
624
625;;
626; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
627;
628; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
629; where the 64-bit accesses requires hand coding.
630;
631; All the functions takes a pointer to the destination memory operand in A0,
632; the source register operand in A1 and a pointer to eflags in A2.
633;
634; @param 1 The instruction mnemonic.
635; @param 2 The modified flags.
636; @param 3 The undefined flags.
637;
; Emits iemAImpl_<instr>_u{32,64}: A0 = in/out operand pointer, A1 = source
; register, A2 = pointer to guest EFLAGS.
; NOTE(review): uses the 4-arg prologue/epilogue although only A0..A2 are
; referenced - confirm this matches the declared 12-byte x86 argument size.
638%macro IEMIMPL_VEX_BIN_OP_2 3
639BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
640 PROLOGUE_4_ARGS
641 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
642 mov T0_32, [A0]
643 %1 T0_32, A1_32
644 mov [A0], T0_32
645 IEM_SAVE_FLAGS A2, %2, %3
646 EPILOGUE_4_ARGS
647ENDPROC iemAImpl_ %+ %1 %+ _u32
648
649 %ifdef RT_ARCH_AMD64
650BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
651 PROLOGUE_4_ARGS
652 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
653 mov T0, [A0]
654 %1 T0, A1
655 mov [A0], T0
656 IEM_SAVE_FLAGS A2, %2, %3
657 EPILOGUE_4_ARGS
658ENDPROC iemAImpl_ %+ %1 %+ _u64
659 %endif ; RT_ARCH_AMD64
660%endmacro
661
662; instr, modified-flags, undefined-flags (BMI1 two-operand forms)
663IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
664IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
665IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
666
667
668;;
669; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
670;
671; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
672; where the 64-bit accesses requires hand coding.
673;
674; All the functions takes a pointer to the destination memory operand in A0,
675; the first source register operand in A1, the second source register operand
676; in A2 and a pointer to eflags in A3.
677;
678; @param 1 The instruction mnemonic.
679; @param 2 Fallback instruction if applicable.
680; @param 3 Whether to emit fallback or not.
681;
; Emits iemAImpl_<instr>_u{32,64}[_fallback]: A0 = dst pointer, A1 = value,
; A2 = shift count; no EFLAGS involved. The fallback variants (%3 != 0)
; emulate the VEX instruction with the legacy shift %2, which needs the
; count in cl: trivial on SysV (A3 is rcx, free), but on MSC A0 is rcx so
; A0/A2 are exchanged first and the result stored through A2.
682%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
683BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
684 PROLOGUE_3_ARGS
685 %1 T0_32, A1_32, A2_32
686 mov [A0], T0_32
687 EPILOGUE_3_ARGS
688ENDPROC iemAImpl_ %+ %1 %+ _u32
689
690 %if %3
691BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
692 PROLOGUE_3_ARGS
693 %ifdef ASM_CALL64_GCC
694 mov cl, A2_8 ; count -> cl (clobbers A3, which is unused here).
695 %2 A1_32, cl
696 mov [A0], A1_32
697 %else
698 xchg A2, A0 ; A0 (rcx) <- count, A2 <- dst pointer.
699 %2 A1_32, cl
700 mov [A2], A1_32
701 %endif
702 EPILOGUE_3_ARGS
703ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
704 %endif
705
706 %ifdef RT_ARCH_AMD64
707BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
708 PROLOGUE_3_ARGS
709 %1 T0, A1, A2
710 mov [A0], T0
711 EPILOGUE_3_ARGS
712ENDPROC iemAImpl_ %+ %1 %+ _u64
713
714 %if %3
715BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
716 PROLOGUE_3_ARGS
717 %ifdef ASM_CALL64_GCC
718 mov cl, A2_8 ; count -> cl.
719 %2 A1, cl
720 mov [A0], A1 ; Store the full 64-bit result (was a truncating 32-bit store).
721 %else
722 xchg A2, A0 ; A0 (rcx) <- count, A2 <- dst pointer.
723 %2 A1, cl
724 mov [A2], A1 ; Store the full 64-bit result through the swapped pointer.
725 %endif
; Note: the stray trailing 'mov [A0], A1' was removed - after the MSC xchg,
; A0 holds the shift count, so that store wrote through a bogus address.
727 EPILOGUE_3_ARGS
728ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
729 %endif
730 %endif ; RT_ARCH_AMD64
731%endmacro
732
733; instr, fallback instr, emit fallback (pdep/pext have no legacy-shift fallback here)
734IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
735IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
736IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
737IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
738IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
739
740
741;
742; RORX uses a immediate byte for the shift count, so we only do
743; fallback implementation of that one.
744;
; RORX fallback: A0 = dst pointer, A1 = value, A2 = rotate count (immediate
; in the real instruction, hence only a fallback is provided).
745BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
746 PROLOGUE_3_ARGS
747 %ifdef ASM_CALL64_GCC
748 mov cl, A2_8 ; count -> cl.
749 ror A1_32, cl
750 mov [A0], A1_32
751 %else
752 xchg A2, A0 ; A0 (count reg) <- count, A2 <- dst pointer.
753 ror A1_32, cl
754 mov [A2], A1_32
755 %endif
756 EPILOGUE_3_ARGS
757ENDPROC iemAImpl_rorx_u32
758
 %ifdef RT_ARCH_AMD64
; 64-bit RORX fallback, same register shuffle as the 32-bit variant above.
760BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
761 PROLOGUE_3_ARGS
762 %ifdef ASM_CALL64_GCC
763 mov cl, A2_8 ; count -> cl.
764 ror A1, cl
765 mov [A0], A1
766 %else
767 xchg A2, A0 ; A0 (rcx) <- count, A2 <- dst pointer.
768 ror A1, cl
769 mov [A2], A1
770 %endif
771 EPILOGUE_3_ARGS
772ENDPROC iemAImpl_rorx_u64
773 %endif ; RT_ARCH_AMD64
774
775
776;
777; MULX
778;
; MULX: A0 = high-result pointer, A1 = low-result pointer, A2 = uSrc1,
; A3 = uSrc2; mulx multiplies by the implicit (e)dx operand.
779BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
780 PROLOGUE_4_ARGS
781%ifdef ASM_CALL64_GCC
782 ; A2_32 is EDX - perfect
783 mulx T0_32, T1_32, A3_32
784 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
785 mov [A0], T0_32
786%else
787 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
788 xchg A1, A2
789 mulx T0_32, T1_32, A3_32
790 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
791 mov [A0], T0_32
792%endif
793 EPILOGUE_4_ARGS
794ENDPROC iemAImpl_mulx_u32
795
796
; MULX fallback for hosts without BMI2: plain MUL, same operand layout as
; iemAImpl_mulx_u32 above (eax/edx hold the product).
797BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
798 PROLOGUE_4_ARGS
799%ifdef ASM_CALL64_GCC
800 ; A2_32 is EDX, T0_32 is EAX
801 mov eax, A3_32
802 mul A2_32
803 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
804 mov [A0], edx
805%else
806 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
807 xchg A1, A2
808 mov eax, A3_32
809 mul A2_32
810 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], edx
812%endif
813 EPILOGUE_4_ARGS
814ENDPROC iemAImpl_mulx_u32_fallback
815
816%ifdef RT_ARCH_AMD64
; 64-bit MULX and its MUL-based fallback; same layout as the 32-bit pair.
817BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
818 PROLOGUE_4_ARGS
819%ifdef ASM_CALL64_GCC
820 ; A2 is RDX - perfect
821 mulx T0, T1, A3
822 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
823 mov [A0], T0
824%else
825 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
826 xchg A1, A2
827 mulx T0, T1, A3
828 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
829 mov [A0], T0
830%endif
831 EPILOGUE_4_ARGS
832ENDPROC iemAImpl_mulx_u64
833
834
835BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
836 PROLOGUE_4_ARGS
837%ifdef ASM_CALL64_GCC
838 ; A2 is RDX, T0 is RAX
839 mov rax, A3
840 mul A2
841 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
842 mov [A0], rdx
843%else
844 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
845 xchg A1, A2
846 mov rax, A3
847 mul A2
848 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
849 mov [A0], rdx
850%endif
851 EPILOGUE_4_ARGS
852ENDPROC iemAImpl_mulx_u64_fallback
853
854%endif
855
856
857;;
858; Macro for implementing a bit operator.
859;
860; This will generate code for the 16, 32 and 64 bit accesses with locked
861; variants, except on 32-bit system where the 64-bit accesses requires hand
862; coding.
863;
864; All the functions takes a pointer to the destination memory operand in A0,
865; the source register operand in A1 and a pointer to eflags in A2.
866;
867; @param 1 The instruction mnemonic.
868; @param 2 Non-zero if there should be a locked version.
869; @param 3 The modified flags.
870; @param 4 The undefined flags.
871;
; Emits iemAImpl_<instr>_u{16,32,64}[_locked] for BT-family instructions:
; A0 = dst pointer, A1 = bit offset register, A2 = guest EFLAGS pointer.
; No u8 variants - BT/BTС/BTS/BTR have no byte form.
872%macro IEMIMPL_BIT_OP 4
873BEGINCODE
874BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
875 PROLOGUE_3_ARGS
876 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
877 %1 word [A0], A1_16
878 IEM_SAVE_FLAGS A2, %3, %4
879 EPILOGUE_3_ARGS
880ENDPROC iemAImpl_ %+ %1 %+ _u16
881
882BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
883 PROLOGUE_3_ARGS
884 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
885 %1 dword [A0], A1_32
886 IEM_SAVE_FLAGS A2, %3, %4
887 EPILOGUE_3_ARGS
888ENDPROC iemAImpl_ %+ %1 %+ _u32
889
890 %ifdef RT_ARCH_AMD64
891BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
892 PROLOGUE_3_ARGS
893 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
894 %1 qword [A0], A1
895 IEM_SAVE_FLAGS A2, %3, %4
896 EPILOGUE_3_ARGS_EX 8
897ENDPROC iemAImpl_ %+ %1 %+ _u64
898 %endif ; RT_ARCH_AMD64
899
900 %if %2 != 0 ; locked versions requested?
901
902BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
903 PROLOGUE_3_ARGS
904 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
905 lock %1 word [A0], A1_16
906 IEM_SAVE_FLAGS A2, %3, %4
907 EPILOGUE_3_ARGS
908ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
909
910BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
911 PROLOGUE_3_ARGS
912 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
913 lock %1 dword [A0], A1_32
914 IEM_SAVE_FLAGS A2, %3, %4
915 EPILOGUE_3_ARGS
916ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
917
918 %ifdef RT_ARCH_AMD64
919BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
920 PROLOGUE_3_ARGS
921 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
922 lock %1 qword [A0], A1
923 IEM_SAVE_FLAGS A2, %3, %4
924 EPILOGUE_3_ARGS_EX 8
925ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
926 %endif ; RT_ARCH_AMD64
927 %endif ; locked
928%endmacro
929; instr, lock, modified eflags, undefined eflags (BT is read-only, hence no locked form)
930IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
931IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
932IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
933IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
934
935;;
936; Macro for implementing a bit search operator.
937;
938; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
939; system where the 64-bit accesses requires hand coding.
940;
941; All the functions takes a pointer to the destination memory operand in A0,
942; the source register operand in A1 and a pointer to eflags in A2.
943;
944; In the ZF case the destination register is 'undefined', however it seems that
945; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
946; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
947; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
948; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
949;
950; @param 1 The instruction mnemonic.
951; @param 2 The modified flags.
952; @param 3 The undefined flags.
953; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
954;
; Emits iemAImpl_<instr>_u{16,32,64} plus _intel/_amd flag-behaviour variants
; for bit-scan instructions: A0 = dst pointer, A1 = source, A2 = EFLAGS ptr.
; When %4 != 0 the destination is left untouched on ZF=1 (source was zero).
955%macro IEMIMPL_BIT_OP2 4
956BEGINCODE
957BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
958 PROLOGUE_3_ARGS
959 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
960 %1 T0_16, A1_16
961%if %4 != 0
962 jz .unchanged_dst ; ZF=1: leave the destination as-is.
963%endif
964 mov [A0], T0_16
965.unchanged_dst:
966 IEM_SAVE_FLAGS A2, %2, %3
967 EPILOGUE_3_ARGS
968ENDPROC iemAImpl_ %+ %1 %+ _u16
969
970BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
971 PROLOGUE_3_ARGS
972 %1 T1_16, A1_16
973%if %4 != 0
974 jz .unchanged_dst
975%endif
976 mov [A0], T1_16
977 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
978 EPILOGUE_3_ARGS
979.unchanged_dst: ; ZF=1 path: clear OF/SF/AF/CF, set ZF and PF.
980 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
981 EPILOGUE_3_ARGS
982ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
983
984BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
985 PROLOGUE_3_ARGS
986 %1 T0_16, A1_16
987%if %4 != 0
988 jz .unchanged_dst
989%endif
990 mov [A0], T0_16
991.unchanged_dst:
992 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
993 EPILOGUE_3_ARGS
994ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
995
996
997BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
998 PROLOGUE_3_ARGS
999 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1000 %1 T0_32, A1_32
1001%if %4 != 0
1002 jz .unchanged_dst
1003%endif
1004 mov [A0], T0_32
1005.unchanged_dst:
1006 IEM_SAVE_FLAGS A2, %2, %3
1007 EPILOGUE_3_ARGS
1008ENDPROC iemAImpl_ %+ %1 %+ _u32
1009
1010BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1011 PROLOGUE_3_ARGS
1012 %1 T1_32, A1_32
1013%if %4 != 0
1014 jz .unchanged_dst
1015%endif
1016 mov [A0], T1_32
1017 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1018 EPILOGUE_3_ARGS
1019.unchanged_dst:
1020 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1021 EPILOGUE_3_ARGS
1022ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1023
1024BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1025 PROLOGUE_3_ARGS
1026 %1 T0_32, A1_32
1027%if %4 != 0
1028 jz .unchanged_dst
1029%endif
1030 mov [A0], T0_32
1031.unchanged_dst:
1032 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1033 EPILOGUE_3_ARGS
1034ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1035
1036
1037 %ifdef RT_ARCH_AMD64
1038
1039BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1040 PROLOGUE_3_ARGS
1041 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1042 %1 T0, A1
1043%if %4 != 0
1044 jz .unchanged_dst
1045%endif
1046 mov [A0], T0
1047.unchanged_dst:
1048 IEM_SAVE_FLAGS A2, %2, %3
1049 EPILOGUE_3_ARGS_EX 8
1050ENDPROC iemAImpl_ %+ %1 %+ _u64
1051
1052BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1053 PROLOGUE_3_ARGS
1054 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1055 %1 T1, A1
1056%if %4 != 0
1057 jz .unchanged_dst
1058%endif
1059 mov [A0], T1
1060 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1061 EPILOGUE_3_ARGS
1062.unchanged_dst:
1063 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1064 EPILOGUE_3_ARGS
1065ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1066
1067BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1068 PROLOGUE_3_ARGS
1069 %1 T0, A1
1070%if %4 != 0
1071 jz .unchanged_dst
1072%endif
1073 mov [A0], T0
1074.unchanged_dst:
1075 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1076 EPILOGUE_3_ARGS_EX 8
1077ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1078
1079 %endif ; RT_ARCH_AMD64
1080%endmacro
1081
; Instantiate the bit-scan/bit-count workers.  The 4th parameter is non-zero
; when the destination must be left unchanged on a zero source (the macro then
; emits a 'jz .unchanged_dst' that skips the store); bsf/bsr only define ZF in
; that case, while tzcnt/lzcnt always write a result.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1086
1087
1088;;
1089; Macro for implementing POPCNT.
1090;
1091; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1092; system where the 64-bit accesses requires hand coding.
1093;
1094; All the functions takes a pointer to the destination memory operand in A0,
1095; the source register operand in A1 and a pointer to eflags in A2.
1096;
1097; ASSUMES Intel and AMD set EFLAGS the same way.
1098;
1099; ASSUMES the instruction does not support memory destination.
1100;
1101; @param 1 The instruction mnemonic.
1102; @param 2 The modified flags.
1103; @param 3 The undefined flags.
1104;
1105%macro IEMIMPL_BIT_OP3 3
1106BEGINCODE
1107BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1108 PROLOGUE_3_ARGS
1109 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1110 %1 T0_16, A1_16
1111 mov [A0], T0_16
1112 IEM_SAVE_FLAGS A2, %2, %3
1113 EPILOGUE_3_ARGS
1114ENDPROC iemAImpl_ %+ %1 %+ _u16
1115
1116BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1117 PROLOGUE_3_ARGS
1118 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1119 %1 T0_32, A1_32
1120 mov [A0], T0_32
1121 IEM_SAVE_FLAGS A2, %2, %3
1122 EPILOGUE_3_ARGS
1123ENDPROC iemAImpl_ %+ %1 %+ _u32
1124
1125 %ifdef RT_ARCH_AMD64
1126BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1127 PROLOGUE_3_ARGS
1128 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, 0
1129 %1 T0, A1
1130 mov [A0], T0
1131 IEM_SAVE_FLAGS A2, %2, %3
1132 EPILOGUE_3_ARGS_EX 8
1133ENDPROC iemAImpl_ %+ %1 %+ _u64
1134 %endif ; RT_ARCH_AMD64
1135%endmacro
1136IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1137
1138
1139;
1140; IMUL is also a similar but yet different case (no lock, no mem dst).
1141; The rDX:rAX variant of imul is handled together with mul further down.
1142;
1143BEGINCODE
1144; @param 1 EFLAGS that are modified.
1145; @param 2 Undefined EFLAGS.
1146; @param 3 Function suffix.
1147; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1148; 2 for AMD (set AF, clear PF, ZF and SF).
1149%macro IEMIMPL_IMUL_TWO 4
1150BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1151 PROLOGUE_3_ARGS
1152 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1153 imul A1_16, word [A0]
1154 mov [A0], A1_16
1155 %if %4 != 1
1156 IEM_SAVE_FLAGS A2, %1, %2
1157 %else
1158 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1159 %endif
1160 EPILOGUE_3_ARGS
1161ENDPROC iemAImpl_imul_two_u16 %+ %3
1162
1163BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1164 PROLOGUE_3_ARGS
1165 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1166 imul A1_32, dword [A0]
1167 mov [A0], A1_32
1168 %if %4 != 1
1169 IEM_SAVE_FLAGS A2, %1, %2
1170 %else
1171 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1172 %endif
1173 EPILOGUE_3_ARGS
1174ENDPROC iemAImpl_imul_two_u32 %+ %3
1175
1176 %ifdef RT_ARCH_AMD64
1177BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1178 PROLOGUE_3_ARGS
1179 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1180 imul A1, qword [A0]
1181 mov [A0], A1
1182 %if %4 != 1
1183 IEM_SAVE_FLAGS A2, %1, %2
1184 %else
1185 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1186 %endif
1187 EPILOGUE_3_ARGS_EX 8
1188ENDPROC iemAImpl_imul_two_u64 %+ %3
1189 %endif ; RT_ARCH_AMD64
1190%endmacro
1191IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1192IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1193IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1194
1195
1196;
1197; XCHG for memory operands. This implies locking. No flag changes.
1198;
1199; Each function takes two arguments, first the pointer to the memory,
1200; then the pointer to the register. They all return void.
1201;
1202BEGINCODE
1203BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1204 PROLOGUE_2_ARGS
1205 mov T0_8, [A1]
1206 xchg [A0], T0_8
1207 mov [A1], T0_8
1208 EPILOGUE_2_ARGS
1209ENDPROC iemAImpl_xchg_u8_locked
1210
1211BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1212 PROLOGUE_2_ARGS
1213 mov T0_16, [A1]
1214 xchg [A0], T0_16
1215 mov [A1], T0_16
1216 EPILOGUE_2_ARGS
1217ENDPROC iemAImpl_xchg_u16_locked
1218
1219BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1220 PROLOGUE_2_ARGS
1221 mov T0_32, [A1]
1222 xchg [A0], T0_32
1223 mov [A1], T0_32
1224 EPILOGUE_2_ARGS
1225ENDPROC iemAImpl_xchg_u32_locked
1226
1227%ifdef RT_ARCH_AMD64
1228BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1229 PROLOGUE_2_ARGS
1230 mov T0, [A1]
1231 xchg [A0], T0
1232 mov [A1], T0
1233 EPILOGUE_2_ARGS
1234ENDPROC iemAImpl_xchg_u64_locked
1235%endif
1236
1237; Unlocked variants for fDisregardLock mode.
1238
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        ; Non-atomic swap: read both locations first, then cross-store them.
        mov     T1_8, [A0]              ; T1 = old memory operand value
        mov     T0_8, [A1]              ; T0 = old register operand value
        mov     [A1], T1_8              ; register operand = old memory value
        mov     [A0], T0_8              ; memory operand = old register value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1247
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        ; Non-atomic swap: read both locations first, then cross-store them.
        mov     T1_16, [A0]             ; T1 = old memory operand value
        mov     T0_16, [A1]             ; T0 = old register operand value
        mov     [A1], T1_16
        mov     [A0], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1256
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        ; Non-atomic swap: read both locations first, then cross-store them.
        mov     T1_32, [A0]             ; T1 = old memory operand value
        mov     T0_32, [A1]             ; T0 = old register operand value
        mov     [A1], T1_32
        mov     [A0], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1265
%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        ; Non-atomic swap: read both locations first, then cross-store them.
        mov     T1, [A0]                ; T1 = old memory operand value
        mov     T0, [A1]                ; T0 = old register operand value
        mov     [A1], T1
        mov     [A0], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1276
1277
1278;
1279; XADD for memory operands.
1280;
1281; Each function takes three arguments, first the pointer to the
1282; memory/register, then the pointer to the register, and finally a pointer to
1283; eflags. They all return void.
1284;
1285BEGINCODE
1286BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1287 PROLOGUE_3_ARGS
1288 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1289 mov T0_8, [A1]
1290 xadd [A0], T0_8
1291 mov [A1], T0_8
1292 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1293 EPILOGUE_3_ARGS
1294ENDPROC iemAImpl_xadd_u8
1295
1296BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1297 PROLOGUE_3_ARGS
1298 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1299 mov T0_16, [A1]
1300 xadd [A0], T0_16
1301 mov [A1], T0_16
1302 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1303 EPILOGUE_3_ARGS
1304ENDPROC iemAImpl_xadd_u16
1305
1306BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1307 PROLOGUE_3_ARGS
1308 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1309 mov T0_32, [A1]
1310 xadd [A0], T0_32
1311 mov [A1], T0_32
1312 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1313 EPILOGUE_3_ARGS
1314ENDPROC iemAImpl_xadd_u32
1315
1316%ifdef RT_ARCH_AMD64
1317BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1318 PROLOGUE_3_ARGS
1319 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
1320 mov T0, [A1]
1321 xadd [A0], T0
1322 mov [A1], T0
1323 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1324 EPILOGUE_3_ARGS
1325ENDPROC iemAImpl_xadd_u64
1326%endif ; RT_ARCH_AMD64
1327
; Locked XADD variants: identical to the ones above except for the explicit
; LOCK prefix on the xadd itself.
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8            ; atomic: *pu8Dst += T0, T0 = old *pu8Dst
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1369
1370
1371;
1372; CMPXCHG8B.
1373;
1374; These are tricky register wise, so the code is duplicated for each calling
1375; convention.
1376;
1377; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1378;
1379; C-proto:
1380; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1381; uint32_t *pEFlags));
1382;
1383; Note! Identical to iemAImpl_cmpxchg16b.
1384;
1385BEGINCODE
1386BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1387%ifdef RT_ARCH_AMD64
1388 %ifdef ASM_CALL64_MSC
1389 push rbx
1390
1391 mov r11, rdx ; pu64EaxEdx (is also T1)
1392 mov r10, rcx ; pu64Dst
1393
1394 mov ebx, [r8]
1395 mov ecx, [r8 + 4]
1396 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1397 mov eax, [r11]
1398 mov edx, [r11 + 4]
1399
1400 cmpxchg8b [r10]
1401
1402 mov [r11], eax
1403 mov [r11 + 4], edx
1404 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1405
1406 pop rbx
1407 ret
1408 %else
1409 push rbx
1410
1411 mov r10, rcx ; pEFlags
1412 mov r11, rdx ; pu64EbxEcx (is also T1)
1413
1414 mov ebx, [r11]
1415 mov ecx, [r11 + 4]
1416 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1417 mov eax, [rsi]
1418 mov edx, [rsi + 4]
1419
1420 cmpxchg8b [rdi]
1421
1422 mov [rsi], eax
1423 mov [rsi + 4], edx
1424 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1425
1426 pop rbx
1427 ret
1428
1429 %endif
1430%else
1431 push esi
1432 push edi
1433 push ebx
1434 push ebp
1435
1436 mov edi, ecx ; pu64Dst
1437 mov esi, edx ; pu64EaxEdx
1438 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1439 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1440
1441 mov ebx, [ecx]
1442 mov ecx, [ecx + 4]
1443 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
1444 mov eax, [esi]
1445 mov edx, [esi + 4]
1446
1447 cmpxchg8b [edi]
1448
1449 mov [esi], eax
1450 mov [esi + 4], edx
1451 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1452
1453 pop ebp
1454 pop ebx
1455 pop edi
1456 pop esi
1457 ret 8
1458%endif
1459ENDPROC iemAImpl_cmpxchg8b
1460
; Same as iemAImpl_cmpxchg8b above, but with a LOCK prefix on the cmpxchg8b
; so the compare-and-exchange is atomic.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is callee-saved and cmpxchg8b needs ebx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = replacement value
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = expected value
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = expected value
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8
%endif
ENDPROC iemAImpl_cmpxchg8b_locked
1535
%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; rbx is callee-saved and cmpxchg16b needs rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement value (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (current value on failure, unchanged on success)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value
        mov     rdx, [rsi + 8]

        cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

; Same as iemAImpl_cmpxchg16b, but with a LOCK prefix for atomicity.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
 %ifdef ASM_CALL64_MSC
        push    rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement value
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0, 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64
1644
1645
1646;
1647; CMPXCHG.
1648;
1649; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1650;
1651; C-proto:
1652; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1653;
1654BEGINCODE
1655%macro IEMIMPL_CMPXCHG 2
1656BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1657 PROLOGUE_4_ARGS
1658 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1659 mov al, [A1]
1660 %1 cmpxchg [A0], A2_8
1661 mov [A1], al
1662 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1663 EPILOGUE_4_ARGS
1664ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1665
1666BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1667 PROLOGUE_4_ARGS
1668 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1669 mov ax, [A1]
1670 %1 cmpxchg [A0], A2_16
1671 mov [A1], ax
1672 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1673 EPILOGUE_4_ARGS
1674ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1675
1676BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1677 PROLOGUE_4_ARGS
1678 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1679 mov eax, [A1]
1680 %1 cmpxchg [A0], A2_32
1681 mov [A1], eax
1682 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1683 EPILOGUE_4_ARGS
1684ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1685
1686BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1687%ifdef RT_ARCH_AMD64
1688 PROLOGUE_4_ARGS
1689 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1690 mov rax, [A1]
1691 %1 cmpxchg [A0], A2
1692 mov [A1], rax
1693 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1694 EPILOGUE_4_ARGS
1695%else
1696 ;
1697 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1698 ;
1699 push esi
1700 push edi
1701 push ebx
1702 push ebp
1703
1704 mov edi, ecx ; pu64Dst
1705 mov esi, edx ; pu64Rax
1706 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1707 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1708
1709 mov ebx, [ecx]
1710 mov ecx, [ecx + 4]
1711 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0, 0 ; clobbers T0 (eax)
1712 mov eax, [esi]
1713 mov edx, [esi + 4]
1714
1715 lock cmpxchg8b [edi]
1716
1717 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1718 jz .cmpxchg8b_not_equal
1719;; @todo this isn't correct. Need to do a 64-bit compare, not just the lower 32-bit.
1720 cmp eax, eax ; just set the other flags.
1721.store:
1722 mov [esi], eax
1723 mov [esi + 4], edx
1724 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1725
1726 pop ebp
1727 pop ebx
1728 pop edi
1729 pop esi
1730 ret 8
1731
1732.cmpxchg8b_not_equal:
1733 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1734 jne .store
1735 cmp [esi], eax
1736 jmp .store
1737
1738%endif
1739ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1740%endmacro ; IEMIMPL_CMPXCHG
1741
1742IEMIMPL_CMPXCHG , ,
1743IEMIMPL_CMPXCHG lock, _locked
1744
1745;;
1746; Macro for implementing a unary operator.
1747;
1748; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1749; variants, except on 32-bit system where the 64-bit accesses requires hand
1750; coding.
1751;
1752; All the functions takes a pointer to the destination memory operand in A0,
1753; the source register operand in A1 and a pointer to eflags in A2.
1754;
1755; @param 1 The instruction mnemonic.
1756; @param 2 The modified flags.
1757; @param 3 The undefined flags.
1758;
1759%macro IEMIMPL_UNARY_OP 3
1760BEGINCODE
1761BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1762 PROLOGUE_2_ARGS
1763 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1764 %1 byte [A0]
1765 IEM_SAVE_FLAGS A1, %2, %3
1766 EPILOGUE_2_ARGS
1767ENDPROC iemAImpl_ %+ %1 %+ _u8
1768
1769BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1770 PROLOGUE_2_ARGS
1771 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1772 lock %1 byte [A0]
1773 IEM_SAVE_FLAGS A1, %2, %3
1774 EPILOGUE_2_ARGS
1775ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1776
1777BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1778 PROLOGUE_2_ARGS
1779 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1780 %1 word [A0]
1781 IEM_SAVE_FLAGS A1, %2, %3
1782 EPILOGUE_2_ARGS
1783ENDPROC iemAImpl_ %+ %1 %+ _u16
1784
1785BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1786 PROLOGUE_2_ARGS
1787 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1788 lock %1 word [A0]
1789 IEM_SAVE_FLAGS A1, %2, %3
1790 EPILOGUE_2_ARGS
1791ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1792
1793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1794 PROLOGUE_2_ARGS
1795 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1796 %1 dword [A0]
1797 IEM_SAVE_FLAGS A1, %2, %3
1798 EPILOGUE_2_ARGS
1799ENDPROC iemAImpl_ %+ %1 %+ _u32
1800
1801BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1802 PROLOGUE_2_ARGS
1803 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1804 lock %1 dword [A0]
1805 IEM_SAVE_FLAGS A1, %2, %3
1806 EPILOGUE_2_ARGS
1807ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1808
1809 %ifdef RT_ARCH_AMD64
1810BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1811 PROLOGUE_2_ARGS
1812 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1813 %1 qword [A0]
1814 IEM_SAVE_FLAGS A1, %2, %3
1815 EPILOGUE_2_ARGS
1816ENDPROC iemAImpl_ %+ %1 %+ _u64
1817
1818BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1819 PROLOGUE_2_ARGS
1820 IEM_MAYBE_LOAD_FLAGS A1, %2, %3, 0
1821 lock %1 qword [A0]
1822 IEM_SAVE_FLAGS A1, %2, %3
1823 EPILOGUE_2_ARGS
1824ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1825 %endif ; RT_ARCH_AMD64
1826
1827%endmacro
1828
1829IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1830IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1831IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1832IEMIMPL_UNARY_OP not, 0, 0
1833
1834
1835;
1836; BSWAP. No flag changes.
1837;
1838; Each function takes one argument, pointer to the value to bswap
1839; (input/output). They all return void.
1840;
1841BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1842 PROLOGUE_1_ARGS
1843 mov T0_32, [A0] ; just in case any of the upper bits are used.
1844 db 66h
1845 bswap T0_32
1846 mov [A0], T0_32
1847 EPILOGUE_1_ARGS
1848ENDPROC iemAImpl_bswap_u16
1849
1850BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1851 PROLOGUE_1_ARGS
1852 mov T0_32, [A0]
1853 bswap T0_32
1854 mov [A0], T0_32
1855 EPILOGUE_1_ARGS
1856ENDPROC iemAImpl_bswap_u32
1857
1858BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1859%ifdef RT_ARCH_AMD64
1860 PROLOGUE_1_ARGS
1861 mov T0, [A0]
1862 bswap T0
1863 mov [A0], T0
1864 EPILOGUE_1_ARGS
1865%else
1866 PROLOGUE_1_ARGS
1867 mov T0, [A0]
1868 mov T1, [A0 + 4]
1869 bswap T0
1870 bswap T1
1871 mov [A0 + 4], T0
1872 mov [A0], T1
1873 EPILOGUE_1_ARGS
1874%endif
1875ENDPROC iemAImpl_bswap_u64
1876
1877
1878;;
1879; Macro for implementing a shift operation.
1880;
1881; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1882; 32-bit system where the 64-bit accesses requires hand coding.
1883;
1884; All the functions takes a pointer to the destination memory operand in A0,
1885; the shift count in A1 and a pointer to eflags in A2.
1886;
1887; @param 1 The instruction mnemonic.
1888; @param 2 The modified flags.
1889; @param 3 The undefined flags.
1890; @param 4 Force load flags.
1891;
1892; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1893;
1894; @note the _intel and _amd variants are implemented in C.
1895;
1896%macro IEMIMPL_SHIFT_OP 4
1897BEGINCODE
1898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1899 PROLOGUE_3_ARGS
1900 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1901 %ifdef ASM_CALL64_GCC
1902 mov cl, A1_8
1903 %1 byte [A0], cl
1904 %else
1905 xchg A1, A0
1906 %1 byte [A1], cl
1907 %endif
1908 IEM_SAVE_FLAGS A2, %2, %3
1909 EPILOGUE_3_ARGS
1910ENDPROC iemAImpl_ %+ %1 %+ _u8
1911
1912BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1913 PROLOGUE_3_ARGS
1914 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1915 %ifdef ASM_CALL64_GCC
1916 mov cl, A1_8
1917 %1 word [A0], cl
1918 %else
1919 xchg A1, A0
1920 %1 word [A1], cl
1921 %endif
1922 IEM_SAVE_FLAGS A2, %2, %3
1923 EPILOGUE_3_ARGS
1924ENDPROC iemAImpl_ %+ %1 %+ _u16
1925
1926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1927 PROLOGUE_3_ARGS
1928 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1929 %ifdef ASM_CALL64_GCC
1930 mov cl, A1_8
1931 %1 dword [A0], cl
1932 %else
1933 xchg A1, A0
1934 %1 dword [A1], cl
1935 %endif
1936 IEM_SAVE_FLAGS A2, %2, %3
1937 EPILOGUE_3_ARGS
1938ENDPROC iemAImpl_ %+ %1 %+ _u32
1939
1940 %ifdef RT_ARCH_AMD64
1941BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1942 PROLOGUE_3_ARGS
1943 IEM_MAYBE_LOAD_FLAGS A2, %2, %3, %4
1944 %ifdef ASM_CALL64_GCC
1945 mov cl, A1_8
1946 %1 qword [A0], cl
1947 %else
1948 xchg A1, A0
1949 %1 qword [A1], cl
1950 %endif
1951 IEM_SAVE_FLAGS A2, %2, %3
1952 EPILOGUE_3_ARGS
1953ENDPROC iemAImpl_ %+ %1 %+ _u64
1954 %endif ; RT_ARCH_AMD64
1955
1956%endmacro
1957
1958;; @todo some questions wrt flags when the shift count is high according to intel docs...
1959IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1960IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1961IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1962IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0, X86_EFL_CF
1963IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1964IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1965IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF), 0
1966
1967
1968;;
1969; Macro for implementing a double precision shift operation.
1970;
1971; This will generate code for the 16, 32 and 64 bit accesses, except on
1972; 32-bit system where the 64-bit accesses requires hand coding.
1973;
1974; The functions takes the destination operand (r/m) in A0, the source (reg) in
1975; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1976;
1977; @param 1 The instruction mnemonic.
1978; @param 2 The modified flags.
1979; @param 3 The undefined flags.
1980;
1981; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1982;
1983; @note the _intel and _amd variants are implemented in C.
1984;
1985%macro IEMIMPL_SHIFT_DBL_OP 3
1986BEGINCODE
1987BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1988 PROLOGUE_4_ARGS
1989 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1990 %ifdef ASM_CALL64_GCC
1991 xchg A3, A2
1992 %1 [A0], A1_16, cl
1993 xchg A3, A2
1994 %else
1995 xchg A0, A2
1996 %1 [A2], A1_16, cl
1997 %endif
1998 IEM_SAVE_FLAGS A3, %2, %3
1999 EPILOGUE_4_ARGS
2000ENDPROC iemAImpl_ %+ %1 %+ _u16
2001
2002BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
2003 PROLOGUE_4_ARGS
2004 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2005 %ifdef ASM_CALL64_GCC
2006 xchg A3, A2
2007 %1 [A0], A1_32, cl
2008 xchg A3, A2
2009 %else
2010 xchg A0, A2
2011 %1 [A2], A1_32, cl
2012 %endif
2013 IEM_SAVE_FLAGS A3, %2, %3
2014 EPILOGUE_4_ARGS
2015ENDPROC iemAImpl_ %+ %1 %+ _u32
2016
2017 %ifdef RT_ARCH_AMD64
2018BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
2019 PROLOGUE_4_ARGS
2020 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2021 %ifdef ASM_CALL64_GCC
2022 xchg A3, A2
2023 %1 [A0], A1, cl
2024 xchg A3, A2
2025 %else
2026 xchg A0, A2
2027 %1 [A2], A1, cl
2028 %endif
2029 IEM_SAVE_FLAGS A3, %2, %3
2030 EPILOGUE_4_ARGS_EX 12
2031ENDPROC iemAImpl_ %+ %1 %+ _u64
2032 %endif ; RT_ARCH_AMD64
2033
2034%endmacro
2035
2036IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2037IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
2038
2039
2040;;
2041; Macro for implementing a multiplication operations.
2042;
2043; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2044; 32-bit system where the 64-bit accesses requires hand coding.
2045;
2046; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2047; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2048; pointer to eflags in A3.
2049;
2050; The functions all return 0 so the caller can be used for div/idiv as well as
2051; for the mul/imul implementation.
2052;
2053; @param 1 The instruction mnemonic.
2054; @param 2 The modified flags.
2055; @param 3 The undefined flags.
2056; @param 4 Name suffix.
2057; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
2058;
2059; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2060;
2061%macro IEMIMPL_MUL_OP 5
2062BEGINCODE
2063BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
2064 PROLOGUE_3_ARGS
2065 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2066 mov al, [A0]
2067 %1 A1_8
2068 mov [A0], ax
2069 %if %5 != 1
2070 IEM_SAVE_FLAGS A2, %2, %3
2071 %else
2072 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
2073 %endif
2074 xor eax, eax
2075 EPILOGUE_3_ARGS
2076ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
2077
2078BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
2079 PROLOGUE_4_ARGS
2080 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2081 mov ax, [A0]
2082 %ifdef ASM_CALL64_GCC
2083 %1 A2_16
2084 mov [A0], ax
2085 mov [A1], dx
2086 %else
2087 mov T1, A1
2088 %1 A2_16
2089 mov [A0], ax
2090 mov [T1], dx
2091 %endif
2092 %if %5 != 1
2093 IEM_SAVE_FLAGS A3, %2, %3
2094 %else
2095 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
2096 %endif
2097 xor eax, eax
2098 EPILOGUE_4_ARGS
2099ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
2100
2101BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
2102 PROLOGUE_4_ARGS
2103 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2104 mov eax, [A0]
2105 %ifdef ASM_CALL64_GCC
2106 %1 A2_32
2107 mov [A0], eax
2108 mov [A1], edx
2109 %else
2110 mov T1, A1
2111 %1 A2_32
2112 mov [A0], eax
2113 mov [T1], edx
2114 %endif
2115 %if %5 != 1
2116 IEM_SAVE_FLAGS A3, %2, %3
2117 %else
2118 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2119 %endif
2120 xor eax, eax
2121 EPILOGUE_4_ARGS
2122ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2123
2124 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2125BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2126 PROLOGUE_4_ARGS
2127 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2128 mov rax, [A0]
2129 %ifdef ASM_CALL64_GCC
2130 %1 A2
2131 mov [A0], rax
2132 mov [A1], rdx
2133 %else
2134 mov T1, A1
2135 %1 A2
2136 mov [A0], rax
2137 mov [T1], rdx
2138 %endif
2139 %if %5 != 1
2140 IEM_SAVE_FLAGS A3, %2, %3
2141 %else
2142 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2143 %endif
2144 xor eax, eax
2145 EPILOGUE_4_ARGS_EX 12
2146ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2147 %endif ; !RT_ARCH_AMD64
2148
2149%endmacro
2150
2151IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2152IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2153IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2154IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2155IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2156IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2157
2158
BEGINCODE
;;
; Worker function for negating a 32-bit number in T1:T0
; @uses None (T0,T1)
;
; Computes T1:T0 = 0 - T1:T0 using two zeroed stack slots as the subtrahend
; storage, so no additional registers are touched (EFLAGS are clobbered by
; the sub/sbb, though).
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; [xSP+xCB] = 0 (high scratch)
        push    0                       ; [xSP]     = 0 (low scratch)
        xchg    T0_32, [xSP]            ; T0 = 0, [xSP] = old T0
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, [xSP+xCB] = old T1
        sub     T0_32, [xSP]            ; T0 = 0 - old T0, CF = borrow for the high half
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow => T1:T0 negated
        add     xSP, xCB*2              ; drop the scratch slots
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2173
2174%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the 128-bit value held in the T1:T0 register
; pair (T1 = high qword, T0 = low qword), used by the 64-bit idiv overflow
; checks below.
; @uses None (T0,T1) - EFLAGS are clobbered by the sub/sbb.
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; temp high qword = 0
        push    0                       ; temp low qword = 0
        xchg    T0, [xSP]               ; park T0 on the stack, T0 = 0.
        xchg    T1, [xSP + xCB]         ; park T1 on the stack, T1 = 0.
        sub     T0, [xSP]               ; T1:T0 = 0 - original T1:T0 ...
        sbb     T1, [xSP + xCB]         ; ... borrow propagated into the high half.
        add     xSP, xCB*2              ; drop the two temporaries.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
2188%endif
2189
2190
;;
; Macro for implementing a division operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer.  The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 on success and -1 if a divide error should be
; raised by the caller.
;
; @param        1       The instruction mnemonic.
; @param        2       The modified flags.
; @param        3       The undefined flags.
; @param        4       1 if signed, 0 if unsigned.
; @param        5       Function suffix.
; @param        6       EFLAGS variation: 0 for native, 1 for intel (ignored),
;                       2 for AMD (set AF, clear PF, ZF and SF).
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check.
        and     A1_32, 0xff             ; Ensure it's zero extended to 16-bits for the idiv range check.
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; AH (the remainder-to-be) must be strictly below the divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A0]        ; T0 = dividend (zero extending to full register to simplify register aliasing)
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_16, A1_16            ; 16-bit compare, since T0_16=0x8000 >> 7 --> T0_16=0x0100. (neg 0x8000 = 0x8000)
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        and     A2_16, 0xffff           ; Zero extend it for simpler sign overflow checks (see below).
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; rDX (the remainder-to-be) must be strictly below the divisor.
        jae     .div_overflow
 %else
        movzx   T0_32, word [A1]        ; Zero extend to simplify register aliasing by clobbing the whole register.
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_32, T1_32            ; 32-bit compares, because 0x80000000 >> 15 = 0x10000 (65536) which doesn't fit in 16 bits.
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up rDX (= A2) for the instruction.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; free up rDX (= A1) for the instruction.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; rDX (the remainder-to-be) must be strictly below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        ;test    A2_32, A2_32 - we did this 5 instructions ago.
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        test    T1_32, 0x80000000       ; neg 0x8000000000000000 = 0x8000000000000000
        jnz     .div_overflow
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up rDX (= A2) for the instruction.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; free up rDX (= A1) for the instruction.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; undo the push from the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; rDX (the remainder-to-be) must be strictly below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we modify it (we out of regs on x86).
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        ;test    A2, A2 - we did this five instructions above.
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        bt      T1, 63                  ; neg 0x8000000000000000'0000000000000000 = same
        jc      .div_overflow
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; free up rDX (= A2) for the instruction.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; free up rDX (= A1) for the instruction.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; undo the push from the signed overflow check.
 %endif
.div_zero:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro
2539
; Instantiations: mnemonic, modified flags, undefined flags, signedness,
; function name suffix, and EFLAGS variation (0 native / 1 intel / 2 amd).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
;; @todo overflows with AX=0x8000 DL=0xc7 IDIV DL
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2547
2548
;;
; Macro for implementing memory fence operation.
;
; No return value, no operands or anything.
;
; @param        1       The instruction.  Emitted as-is, so it must be a
;                       standalone fence mnemonic (lfence/sfence/mfence).
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2567
;;
; Alternative for non-SSE2 host: a locked memory access serves as a full
; barrier (xchg with a memory operand carries an implicit LOCK).
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; scratch slot on the stack.
        xchg    xAX, [xSP]              ; implicitly locked; also restores xAX.
        add     xSP, xCB                ; drop the scratch slot.
        ret
ENDPROC iemAImpl_alt_mem_fence
2577
2578
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Starts from the host environment (fnstenv), patches in the guest FCW
; exception/precision/rounding bits and the guest FSW condition code bits,
; then reloads the combined environment (fldenv).
;
; Note! Uses the 32-bit register forms throughout for consistency with
;       FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 below; movzx/and already
;       zero-extend, so the semantics are unchanged.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the host TOP, merge guest C-bits.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]
%endmacro
2604
2605
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; keep the host TOP, merge guest C-bits.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT ; T1 = guest TOP, indexing the FTW bit
        and     T1_32, X86_FSW_TOP_SMASK ; for the guest's current top register.
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]
%endmacro
2643
2644
;;
; FPU result packet: an 80-bit (10 byte) value plus the output status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result  resw 5                  ; 80-bit floating point result.
    .FSW        resw 1                  ; FPU status word after the operation.
endstruc
2652
2653
;;
; FPU result packet with two 80-bit values and the output status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; First 80-bit floating point result.
    .FSW        resw 1                  ; FPU status word after the operation.
    .r80Result2 resw 5                  ; Second 80-bit floating point result.
endstruc
2662
2663
2664;
2665;---------------------- 16-bit signed integer operations ----------------------
2666;
2667
2668
;;
; Converts a 16-bit signed integer to an 80-bit floating point value
; (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit                          ; start from a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fild    word [A2]               ; convert and push onto the FPU stack.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2692
2693
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW supplies the rounding mode.
        fistp   word [A2]               ; convert and store per FCW.RC.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2717
2718
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; truncating convert (ignores FCW.RC).

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2743
2744
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; ST0 <op>= 16-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2781
2782
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; only the resulting FSW is of interest.

        fnstsw  word [A1]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2813
2814
2815
2816;
2817;---------------------- 32-bit signed integer operations ----------------------
2818;
2819
2820
;;
; Converts a 32-bit signed integer to an 80-bit floating point value
; (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fild    dword [A2]              ; convert and push onto the FPU stack.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2844
2845
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW supplies the rounding mode.
        fistp   dword [A2]              ; convert and store per FCW.RC.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2869
2870
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; truncating convert (ignores FCW.RC).

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2895
2896
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 <op>= 32-bit integer operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2933
2934
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; only the resulting FSW is of interest.

        fnstsw  word [A1]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2965
2966
2967
2968;
2969;---------------------- 64-bit signed integer operations ----------------------
2970;
2971
2972
;;
; Converts a 64-bit signed integer to an 80-bit floating point value
; (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fild    qword [A2]              ; convert and push onto the FPU stack.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2996
2997
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW supplies the rounding mode.
        fistp   qword [A2]              ; convert and store per FCW.RC.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
3021
3022
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to convert.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; truncating convert (ignores FCW.RC).

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
3047
3048
3049
3050;
3051;---------------------- 32-bit floating point operations ----------------------
3052;
3053
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fld     dword [A2]              ; widening load onto the FPU stack.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
3077
3078
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW supplies rounding/precision.
        fst     dword [A2]              ; narrowing store per FCW.RC.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
3102
3103
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; ST0 <op>= 32-bit float operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3140
3141
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; only the resulting FSW is of interest.

        fnstsw  word [A1]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3172
3173
3174
3175;
3176;---------------------- 64-bit floating point operations ----------------------
3177;
3178
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fld     qword [A2]              ; widening load onto the FPU stack.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3202
3203
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW supplies rounding/precision.
        fst     qword [A2]              ; narrowing store per FCW.RC.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3227
3228
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; ST0 <op>= 64-bit float operand.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3265
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A2]              ; ST0 = 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; only the resulting FSW is of interest.

        fnstsw  word [A1]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3296
3297
3298
3299;
3300;---------------------- 80-bit floating point operations ----------------------
3301;
3302
;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fld     tword [A2]              ; round-trip through the FPU to get the FSW.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3326
3327
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; full-width store, no rounding involved.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3351
3352
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply guest FCW + safe FSW bits.
        fbld    tword [A2]              ; convert packed BCD and push.

        fnstsw  word [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before the next FPU insn.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3376
3377
;;
; Store a 80-bit floating point register to memory as BCD
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for fnstenv/fldenv.

        fninit
        fld     tword [A3]              ; ST0 = value to store.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]              ; convert to packed BCD and store.

        fnstsw  word [A1]               ; return resulting status word.

        fninit                          ; don't leak guest state to the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3401
3402
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1       The instruction
; @param 2       The instruction operands, e.g. {st0, st1}, or {} for implicit-operand forms.
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a IEMFPURESULT for the output.
; @param A2      Pointer to the first 80-bit value (ST0)
; @param A3      Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A3]              ; STn operand, ends up in st1
        fld     tword [A2]              ; ST0 operand, ends up in st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before executing the instruction
        %1      %2

        fnstsw  word  [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3443
3444
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1       The instruction
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a IEMFPURESULT for the output.
; @param A2      Pointer to the first 80-bit value (ST1).
; @param A3      Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A2]              ; ST1 operand first (pushed deeper)
        fld     tword [A3]              ; ST0 operand on top
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before executing the instruction
        %1                              ; computes into st1 and pops, leaving the result as st0

        fnstsw  word  [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3480
3481
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1       The instruction
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a uint16_t for the resulting FSW.
; @param A2      Pointer to the first 80-bit value.
; @param A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A3]              ; second operand -> st1
        fld     tword [A2]              ; first operand -> st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before the compare
        %1      st0, st1

        fnstsw  word  [A1]              ; only the status word is returned (C0..C3 hold the result)

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3514
3515
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1       The instruction
;
; @returns       EFLAGS in EAX.
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a uint16_t for the resulting FSW.
; @param A2      Pointer to the first 80-bit value.
; @param A3      Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A3]              ; second operand -> st1
        fld     tword [A2]              ; first operand -> st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before the compare
        %1      st1                     ; fcomi/fucomi set ZF/PF/CF from the comparison

        fnstsw  word  [A1]
        pushf                           ; return the resulting EFLAGS in xAX
        pop     xAX

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3551
3552
;;
; FPU instruction working on one 80-bit floating point value.
;
; @param 1       The instruction
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a IEMFPURESULT for the output.
; @param A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A2]              ; operand -> st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before executing the instruction
        %1

        fnstsw  word  [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80 fchs
IEMIMPL_FPU_R80 fabs
IEMIMPL_FPU_R80 f2xm1
IEMIMPL_FPU_R80 fsqrt
IEMIMPL_FPU_R80 frndint
IEMIMPL_FPU_R80 fsin
IEMIMPL_FPU_R80 fcos
3589
3590
;;
; FPU instruction working on one 80-bit floating point value, only
; returning FSW.
;
; @param 1       The instruction
; @param 2       Non-zero to also restore FTW.
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a uint16_t for the resulting FSW.
; @param A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area used by the FPU_LD_FXSTATE_* macros

        fninit                          ; clean x87 state
        fld     tword [A2]              ; operand -> st0
%if %2 != 0
        ; fxam classifies by tag, so the guest FTW must be restored as well (with st0 marked valid).
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1

        fnstsw  word  [A1]              ; only the status word is returned

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3626
3627
3628
;;
; FPU instruction loading a 80-bit floating point constant.
;
; @param 1       The instruction
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a IEMFPURESULT for the output.
;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; guest FCW matters for the constant's rounding
        %1                              ; push the constant as st0

        fnstsw  word  [A1 + IEMFPURESULT.FSW] ; capture FSW before clearing exceptions
        fnclex                          ; clear pending exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3663
3664
;;
; FPU instruction working on one 80-bit floating point value, outputting two.
;
; @param 1       The instruction
;
; @param A0      FPU context (fxsave).
; @param A1      Pointer to a IEMFPURESULTTWO for the output.
; @param A2      Pointer to the 80-bit value.
;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area used by FPU_LD_FXSTATE_FCW_AND_SAFE_FSW

        fninit                          ; clean x87 state
        fld     tword [A2]              ; operand -> st0
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; apply the guest FCW before executing the instruction
        %1                              ; produces two results: st0 and st1

        fnstsw  word  [A1 + IEMFPURESULT.FSW]
        fnclex                          ; clear pending exceptions before each fstp so they cannot fault
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; top of stack is the second result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1]

        fninit                          ; leave the FPU clean for the host
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3699
3700
3701
3702
;---------------------- SSE and MMX Operations ----------------------

; NOTE(review): all six prologue/epilogue hooks below currently expand to nothing.
; They exist as placeholders so per-technology save/restore code can be added in
; one place later; helpers must therefore pair them correctly (MMX/SSE/AVX,
; PROLOGUE at entry, EPILOGUE at exit).

;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3722
3723
;;
; Media instruction working on two full sized registers.
;
; @param 1       The instruction
; @param 2       Whether there is an MMX variant (1) or not (0).
;
; @param A0      FPU context (fxsave).        (currently unused by the generated code)
; @param A1      Pointer to the first media register size operand (input/output).
; @param A2      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; destination operand
        movq    mm1, [A2]               ; source operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned loads: operands are plain memory, not guaranteed 16-byte aligned
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3824
3825
;;
; Media instruction working on two full sized registers, but no FXSAVE state argument.
;
; @param 1       The instruction
; @param 2       Whether there is an MMX variant (1) or not (0).
;
; @param A0      Pointer to the first media register size operand (input/output).
; @param A1      Pointer to the second media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; destination operand
        movq    mm1, [A1]               ; source operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write back in place

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned loads: operands are plain memory
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3897
;;
; Media instruction working on one full sized and one half sized register (lower half).
;
; @param 1       The instruction
; @param 2       1 if MMX is included, 0 if not.
;
; @param A0      Pointer to the first full sized media register operand (input/output).
; @param A1      Pointer to the second half sized media register operand (input).
;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; destination operand
        movq    mm1, [A1]               ; source; the punpck* instruction picks the half it needs
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3941
3942
;;
; Media instruction working two half sized input registers (lower half) and a full sized
; destination register (vpunpckl*).
;
; @param 1       The instruction
;
; @param A0      Pointer to the destination register (full sized, output only).
; @param A1      Pointer to the first full sized media source register operand, where we
;                will only use the lower half as input - but we'll be loading it in full.
; @param A2      Pointer to the second full sized media source register operand, where we
;                will only use the lower half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1L1L1 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste); harmless while both expand empty
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3987
3988
;;
; Media instruction working on one full sized and one half sized register (high half).
;
; @param 1       The instruction
; @param 2       1 if MMX is included, 0 if not.
;
; @param A0      Pointer to the first full sized media register operand (input/output).
; @param A1      Pointer to the second full sized media register operand, where we
;                will only use the upper half as input - but we'll load it in full.
;
%macro IEMIMPL_MEDIA_F1H1 2
; The generated code is identical to the low-half variant; the punpckh*
; instruction itself selects the upper half of the source operands.
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw, 1
IEMIMPL_MEDIA_F1L1 punpckhwd, 1
IEMIMPL_MEDIA_F1L1 punpckhdq, 1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
4007
4008
;;
; Media instruction working two half sized input registers (high half) and a full sized
; destination register (vpunpckh*).
;
; @param 1       The instruction
;
; @param A0      Pointer to the destination register (full sized, output only).
; @param A1      Pointer to the first full sized media source register operand, where we
;                will only use the upper half as input - but we'll be loading it in full.
; @param A2      Pointer to the second full sized media source register operand, where we
;                will only use the upper half as input - but we'll be loading it in full.
;
%macro IEMIMPL_MEDIA_F1H1H1 1
; Same generated code as the low-half variant; vpunpckh* picks the upper half itself.
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
4029
4030
;
; Shufflers with evil 8-bit immediates.
;
; The immediate byte is baked into the instruction encoding, so we generate a
; 256-entry table of fixed-size "pshufw mm0, mm1, N / ret" stubs and compute a
; call target from the immediate value.
;

BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm1, [A1]
        movq    mm0, mm0                ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(pshufw+ret) == 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
 %endif
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm8 * stub-size
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; build-time check that the stub size math above is right
ENDPROC iemAImpl_pshufw_u64
4065
4066
;;
; SSE shuffles with an 8-bit immediate, implemented via a 256-entry stub table
; (see iemAImpl_pshufw_u64 for the technique).
;
; @param 1       The instruction (pshufhw, pshuflw or pshufd).
;
; @param A0      Pointer to the destination (output).
; @param A1      Pointer to the source (input).
; @param A2      The immediate (only the low 8 bits are used).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub size math
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
4104
4105
;;
; AVX shuffles with an 8-bit immediate, implemented via a 256-entry stub table
; (see iemAImpl_pshufw_u64 for the technique).
;
; @param 1       The instruction (vpshufhw, vpshuflw or vpshufd).
;
; @param A0      Pointer to the destination (output).
; @param A1      Pointer to the source (input).
; @param A2      The immediate (only the low 8 bits are used).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fix: these are AVX instructions, so use the AVX hooks (was IEMIMPL_SSE_PROLOGUE)

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia!
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub size math
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4142
4143
;
; Shifts with evil 8-bit immediates.
;
; Same 256-entry stub table technique as the shufflers above.
;

%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movq    mm0, [A0]               ; operand is shifted in place
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*8]         ; sizeof(psXX+ret) == 9
 %else
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; build-time check of the stub size math
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4188
4189
;;
; SSE shifts with an 8-bit immediate, via a 256-entry stub table.
;
; @param 1       The shift instruction.
;
; @param A0      Pointer to the operand (shifted in place).
; @param A1      The immediate shift count (only the low 8 bits are used).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A1, A1_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; operand is shifted in place
        lea     T1, [.imm0 xWrtRIP]     ; base of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time check of the stub size math
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4232
4233
;
; Move byte mask.
;
; @param A0      Pointer to the output mask (stored as a register-sized value).
; @param A1      Pointer to the source media register.
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1                ; one bit per source byte -> low 8 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit output
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64
4251
;; SSE variant of the byte-mask move; see iemAImpl_pmovmskb_u64 for parameters.
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1               ; one bit per source byte -> low 16 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit output
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4265
;; AVX2 variant of the byte-mask move; see iemAImpl_pmovmskb_u64 for parameters.
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; one bit per source byte -> low 32 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; 32-bit host: zero the upper half of the 64-bit output
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4279
4280
;;
; Media instruction working on two full sized source registers and one destination (AVX).
;
; @param 1       The instruction
;
; @param A0      Pointer to the extended CPU/FPU state (X86XSAVEAREA).  (currently unused by the generated code)
; @param A1      Pointer to the destination media register size operand (output).
; @param A2      Pointer to the first source media register size operand (input).
; @param A3      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste); harmless while both expand empty
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4352
4353
;;
; Media instruction working on two full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1       The instruction
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste); harmless while both expand empty
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
IEMIMPL_MEDIA_OPT_F3 vpsubsb
IEMIMPL_MEDIA_OPT_F3 vpsubsw
IEMIMPL_MEDIA_OPT_F3 vpsubusb
IEMIMPL_MEDIA_OPT_F3 vpsubusw
IEMIMPL_MEDIA_OPT_F3 vpaddusb
IEMIMPL_MEDIA_OPT_F3 vpaddusw
IEMIMPL_MEDIA_OPT_F3 vpaddsb
IEMIMPL_MEDIA_OPT_F3 vpaddsw
IEMIMPL_MEDIA_OPT_F3 vpermilps
IEMIMPL_MEDIA_OPT_F3 vpermilpd
IEMIMPL_MEDIA_OPT_F3 vpmaddwd
4431
;;
; Media instruction working on one full sized source register, one full sized destination
; register, and one no-larger-than-XMM register (in the vps{ll,ra,rl}[dwq] instructions,
; this is actually used to retrieve a 128-bit load, from which a 64-bit shift length is
; extracted; if the 64-bit unsigned value is larger than the permissible max shift size
; of either 16, 32, or 64, it acts like the max shift size)
;
; @param 1       The instruction
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      Pointer to the second source media register size operand (input).
;
%macro IEMIMPL_SHIFT_OPT_F3 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste); harmless while both expand empty
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu xmm1, [A2]              ; the shift count operand is always XMM sized
        %1      ymm0, ymm0, xmm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_SHIFT_OPT_F3 vpsllw
IEMIMPL_SHIFT_OPT_F3 vpslld
IEMIMPL_SHIFT_OPT_F3 vpsllq
IEMIMPL_SHIFT_OPT_F3 vpsraw
IEMIMPL_SHIFT_OPT_F3 vpsrad
IEMIMPL_SHIFT_OPT_F3 vpsrlw
IEMIMPL_SHIFT_OPT_F3 vpsrld
IEMIMPL_SHIFT_OPT_F3 vpsrlq
4481
4482
;;
; Media instruction working on one full sized source registers and one destination (AVX),
; but no XSAVE state pointer argument.
;
; @param 1       The instruction
; @param 2       Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste); harmless while both expand empty
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fix: was IEMIMPL_AVX_PROLOGUE (copy & paste)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4525
4526
;
; The SSE 4.2 crc32
;
; @param A0      Pointer to the 32-bit destination (CRC accumulator, updated in place).
; @param A1      The source operand, sized according to the suffix.
;
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC value
        crc32   T0_32, A1_8             ; accumulate one byte
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8
4542
;; 16-bit source variant; see iemAImpl_crc32_u8 for parameters.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC value
        crc32   T0_32, A1_16            ; accumulate one word
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4552
;; 32-bit source variant; see iemAImpl_crc32_u8 for parameters.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC value
        crc32   T0_32, A1_32            ; accumulate one dword
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4562
%ifdef RT_ARCH_AMD64
;; 64-bit source variant (AMD64 hosts only); see iemAImpl_crc32_u8 for parameters.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the current CRC; the 32-bit write zero-extends T0
        crc32   T0, A1                  ; accumulate one qword (64-bit operand form)
        mov     [A0], T0_32             ; only the low 32 bits are meaningful

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4574
4575
;
; PTEST (SSE 4.1)
;
; @param A0      Pointer to the first source operand (aka readonly destination).
; @param A1      Pointer to the second source operand.
; @param A2      Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1              ; sets ZF/CF from the AND / ANDN tests; no register result
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4595
;; 256-bit VPTEST; same contract as iemAImpl_ptest_u128 (flags only, sources untouched).
;; NOTE(review): uses the IEMIMPL_SSE_* prologue/epilogue macros for an AVX
;; instruction - presumably equivalent to the AVX ones here; confirm.
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; capture ZF/CF (and cleared bits) for the guest

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4608
4609
;;
; Template for the [v]pmov{s,z}x* instructions
;
; @param 1        The instruction
;
; @param A0       Pointer to the destination media register size operand (output).
; @param A1       The source operand value (input); for the _u256 variant A1 is
;                 a pointer to the 128-bit source operand instead.
;
; NOTE(review): movd loads only 32 bits of A1; variants consuming more source
;               bits (e.g. pmovsxdq/pmovzxdq need 64) may require movq - confirm
;               against the callers' operand marshalling.
;
%macro IEMIMPL_V_PMOV_SZ_X 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]              ; here A1 is a pointer to the full 128-bit source
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4669
4670
;;
; SSE result package: the 128-bit value plus the MXCSR produced by the op.
; Need to move this as well somewhere better?
;
struc IEMSSERESULT
    .uResult        resd 4              ; 128-bit result value
    .MXCSR          resd 1              ; MXCSR after the operation (status flags merged in)
endstruc


;;
; AVX 128-bit result package; layout identical to IEMSSERESULT.
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
    .uResult        resd 4              ; 128-bit result value
    .MXCSR          resd 1              ; MXCSR after the operation
endstruc


;;
; AVX 256-bit result package.
; Need to move this as well somewhere better?
;
struc IEMAVX256RESULT
    .uResult        resd 8              ; 256-bit result value
    .MXCSR          resd 1              ; MXCSR after the operation
endstruc
4696
4697
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only the guest's FZ, RC and DAZ bits are taken over; all exceptions are
; masked so the host never takes a #XM fault while emulating.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1       Expression giving the address of the FXSTATE of the guest.
;
; @note Leaves the saved host MXCSR on the stack; the paired
;       SSE_ST_FXSTATE_MXCSR macro consumes it and restores xSP.
;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR (left on the stack, see @note)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep only guest FZ/rounding/DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions on the host
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
4717
4718
;;
; Restores the SSE MXCSR register with the original value.
;
; Reads the post-operation MXCSR, merges its exception status flags into the
; guest's MXCSR value and stores that into the result struct, then reloads
; the host MXCSR that SSE_LD_FXSTATE_MXCSR left on the stack.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1       Expression giving the address where to return the MXCSR value
;                (an IEMSSERESULT).
; @param 2       Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer (pops the slot pushed by the LD macro).
;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; fetch MXCSR as left by the emulated instruction
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status flags
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4
%endmacro
4742
4743
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; @uses 4 bytes of stack to save the original value.
; @param 1       Expression giving the address of the FXSTATE of the guest.
;
; @note Leaves the saved host MXCSR on the stack; the paired
;       AVX128/256_ST_XSAVEAREA_MXCSR macro consumes it and restores xSP.
; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR this does not OR in
;       X86_MXCSR_XCPT_MASK, leaving all exceptions unmasked - confirm
;       this is intentional for the AVX paths.
;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save host MXCSR (left on the stack, see @note)
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; keep only guest FZ/rounding/DAZ
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged control value
        add     xSP, 4
%endmacro
4762
4763
;;
; Restores the AVX128 MXCSR register with the original value.
;
; Stores the post-operation MXCSR straight into the result struct, then
; reloads the host MXCSR that AVX_LD_XSAVEAREA_MXCSR left on the stack.
;
; @param 1       Expression giving the address where to return the MXCSR value
;                (an IEMAVX128RESULT).
;
; @note Restores the stack pointer (pops the slot pushed by the LD macro).
;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; MXCSR as left by the emulated instruction

        ldmxcsr [xSP]                   ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4777
4778
;;
; Restores the AVX256 MXCSR register with the original value.
;
; Same as AVX128_ST_XSAVEAREA_MXCSR but targets an IEMAVX256RESULT.
;
; @param 1       Expression giving the address where to return the MXCSR value
;                (an IEMAVX256RESULT).
;
; @note Restores the stack pointer (pops the slot pushed by the LD macro).
;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; MXCSR as left by the emulated instruction

        ldmxcsr [xSP]                   ; restore host MXCSR saved by AVX_LD_XSAVEAREA_MXCSR
        add     xSP, 4
%endmacro
4792
4793
;;
; Floating point instruction working on two full sized registers.
;
; @param 1        The instruction
; @param 2        Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
;
; @param A0       FPU context (FXSTATE or XSAVEAREA).
; @param A1       Where to return the result including the MXCSR value.
; @param A2       Pointer to the first media register size operand (input/output).
; @param A3       Pointer to the second media register size operand (input).
;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to guest rounding mode, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store result MXCSR and restore the host one
        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1        ; three operand AVX form
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1        ; three operand AVX form
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1              ; two operand AVX form (unary instructions)
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1              ; two operand AVX form (unary instructions)
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 rcpps, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4919
4920
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param 1        The instruction
;
; @param A0       FPU context (FXSTATE or XSAVEAREA).
; @param A1       Where to return the result including the MXCSR value.
; @param A2       Pointer to the first media register size operand (input/output).
; @param A3       Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to guest rounding mode, mask exceptions

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]              ; 32-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store result MXCSR and restore the host one
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]              ; 32-bit scalar source
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo, cf. IEMIMPL_FP_F2_R64)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro

IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
IEMIMPL_FP_F2_R32 rcpss
4973
4974
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; @param 1        The instruction
;
; @param A0       FPU context (FXSTATE or XSAVEAREA).
; @param A1       Where to return the result including the MXCSR value.
; @param A2       Pointer to the first media register size operand (input/output).
; @param A3       Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to guest rounding mode, mask exceptions

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]              ; 64-bit scalar source
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store result MXCSR and restore the host one
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]              ; 64-bit scalar source
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
5025
5026
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; @param 1        The instruction name.
; @param 2        Whether the AVX256 result is 128-bit (0) or 256-bit (1), i.e.
;                 whether the conversion narrows (ymm -> xmm) or widens (xmm -> ymm).
;
; @param A0       FPU context (FXSTATE or XSAVEAREA).
; @param A1       Where to return the result including the MXCSR value.
; @param A2       Pointer to the first media register size operand (input).
; @param A3       Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; switch to guest rounding mode, mask exceptions

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0     ; store result MXCSR and restore the host one
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1              ; narrowing: 256-bit source, 128-bit result
 %else
        v %+ %1 ymm0, xmm1              ; widening: 128-bit source, 256-bit result
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
5091
5092
;;
; shufps instructions with 8-bit immediates.
;
; Since the immediate must be encoded in the instruction, a 256-entry jump
; table of little 'shufps xmm0, xmm1, imm; ret' stubs is generated below and
; the immediate is used to index into it.
;
; @param A0      Pointer to the destination media register size operand (input/output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufps  xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time stub size check
ENDPROC iemAImpl_shufps_u128
5131
5132
;;
; shufpd instruction with 8-bit immediates.
;
; Same jump-table scheme as iemAImpl_shufps_u128; shufpd encodes one byte
; longer than shufps, so no int3 padding is needed to reach 6 bytes per stub.
;
; @param A0      Pointer to the destination media register size operand (input/output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1                      ; execute the stub for this immediate
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time stub size check
ENDPROC iemAImpl_shufpd_u128
5170
5171
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Uses the same 256-entry stub-table dispatch as iemAImpl_shufps_u128 to get
; the immediate encoded into the executed instruction.
;
; @param 1       The instruction name.
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      Pointer to the second source media register size operand (input).
; @param A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5248
5249
;;
; One of the [p]blendv{b,ps,pd} variants
;
; The SSE4.1 blendv forms use xmm0 as an implicit mask operand, hence the
; mask is loaded into xmm0 and the actual blend works on xmm1/xmm2.
;
; @param 1        The instruction
;
; @param A0       Pointer to the first media register sized operand (input/output).
; @param A1       Pointer to the second media sized value (input).
; @param A2       Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE (copy & paste typo)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5278
5279
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; Unlike the SSE forms, the AVX forms take the mask as an explicit fourth
; register operand.
;
; @param 1        The instruction
;
; @param A0       Pointer to the first media register sized operand (output).
; @param A1       Pointer to the first media register sized operand (input).
; @param A2       Pointer to the second media register sized operand (input).
; @param A3       Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_AVX_PROLOGUE (copy & paste typo)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5322
5323
;;
; palignr mm1, mm2/m64 instruction.
;
; MMX variant; dispatches through a 256-entry stub table to get the
; immediate encoded (same scheme as iemAImpl_shufps_u128).
;
; @param A0      Pointer to the first media register sized operand (output).
; @param A1      The second register sized operand (input).
; @param A2      The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]
        movq    mm1, A1                 ; A1 is a value, not a pointer
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; build-time stub size check
ENDPROC iemAImpl_palignr_u64
5360
5361
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Dispatches through a 256-entry stub table (8 bytes per stub, 12 with
; endbr) to get the immediate encoded into the executed instruction.
;
; @param 1       The instruction name.
;
; @param A0      Pointer to the first media register size operand (input/output).
; @param A1      Pointer to the second source media register size operand (input).
; @param A2      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
 %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5414
5415
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Dispatches through a 256-entry stub table (8 bytes per stub, 12 with
; endbr) to get the immediate encoded into the executed instruction.
;
; @param 1       The instruction name.
; @param 2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param 3       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      Pointer to the second source media register size operand (input).
; @param A3      The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret+int3) == 8: A3 * 8
  %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendd, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vmpsadbw, 1, 1
5509
5510
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Dispatches through a 256-entry stub table (8 bytes per stub, 12 with
; endbr) to get the immediate encoded into the executed instruction.
;
; @param 1       The instruction name.
; @param 2       Whether the instruction has a 128-bit variant (1) or not (0).
; @param 3       Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0      Pointer to the destination media register size operand (output).
; @param A1      Pointer to the first source media register size operand (input).
; @param A2      The 8-bit immediate
;
; NOTE(review): only A0..A2 are used yet PROLOGUE_4_ARGS/EPILOGUE_4_ARGS are
;               employed - presumably harmless, but confirm it is intentional.
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 3
 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
  %endif
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
 %endif

 %if %3 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        vmovdqu ymm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
  %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*2]         ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
        lea     T1, [T1 + T0*4]
  %else
        lea     T1, [T1 + A2*8]         ; sizeof(insnX+ret+int3) == 8: A2 * 8
  %endif
        IBT_NOTRACK
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_ %+ %1 %+ _imm_u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilps, 1, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_2OP_6 vpermilpd, 1, 1
5594
5595
;;
; Source operand packages for the pcmpistr*/pcmpestr* helpers below.
; Need to move this as well somewhere better?
;
struc IEMPCMPISTRXSRC
    .uSrc1          resd 4              ; first 128-bit string source
    .uSrc2          resd 4              ; second 128-bit string source
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1          resd 4              ; first 128-bit string source
    .uSrc2          resd 4              ; second 128-bit string source
    .u64Rax         resd 2              ; explicit length register RAX (64-bit, stored as 2 dwords)
    .u64Rdx         resd 2              ; explicit length register RDX (64-bit, stored as 2 dwords)
endstruc
5610
;;
; The pcmpistri instruction.
;
; Dispatches through a 256-entry stub table to encode the immediate; the
; index result produced in ECX is stored to *A0 and the flags to *A1.
;
; @param A0      Pointer to the ECX register to store the result to (output).
; @param A1      Pointer to the EFLAGS register.
; @param A2      Pointer to the structure containing the source operands (input).
; @param A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; store the index result via the saved A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistri xmm0, xmm1, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_pcmpistri_u128
5653
;;
; The pcmpestri instruction.
;
; Like pcmpistri but with explicit lengths: the guest RAX/RDX values are
; loaded from the source structure before executing the stub.
;
; @param A0      Pointer to the ECX register to store the result to (output).
; @param A1      Pointer to the EFLAGS register.
; @param A2      Pointer to the structure containing the source operands (input).
; @param A3      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0                  ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1

        pop     xDX                     ; restore the clobbered argument register
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx               ; store the index result via the saved A0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_pcmpestri_u128
5700
;;
; The pcmpistrm instruction template.
;
; Dispatches through a 256-entry stub table to encode the immediate; the mask
; result produced in XMM0 is stored to *A0 and the flags to *A1.
;
; @param A0       Pointer to the XMM0 register to store the result to (output).
; @param A1       Pointer to the EFLAGS register.
; @param A2       Pointer to the structure containing the source operands (input).
; @param A3       The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(pcmpistrm+ret) == 8: A3 * 8
                                        ; fixed: was 'lea T0, [T1 + A3*8]', which left T1 pointing
                                        ; at .imm0 so every immediate dispatched to the imm8=0 stub.
 %endif
        IBT_NOTRACK
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; store the mask result produced in xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3                            ; pads each stub to a uniform size
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; build-time stub size check
ENDPROC iemAImpl_pcmpistrm_u128
5742
;;
; The pcmpestrm instruction template.
;
; Dispatches through a 256-entry jump array so the immediate can be baked into
; the instruction encoding (pcmpestrm only takes an imm8, not a register).
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insnX+ret) == 8: A3 * 8
 %endif
        push    xDX                     ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.

        pop     xDX
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0              ; pcmpestrm returns its mask in XMM0.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        db      0x48                    ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_pcmpestrm_u128
5788
5789
;;
; pinsrw instruction (MMX variant).
;
; Dispatches through a 256-entry jump array because the word index is an imm8.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, [A0]               ; Load the destination to merge the word into.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pinsrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_pinsrw_u64
5826
;; pinsrw instruction, SSE variant; same arguments and dispatch scheme as the
;; MMX variant above but with a 6-byte stub stride (SSE encoding is one byte longer).
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]              ; Load the destination to merge the word into.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_pinsrw_u128
5856
;;
; vpinsrw instruction.
;
; Non-destructive AVX form: the source register is copied and the word is
; inserted into the copy, which is written to a separate destination.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 16 bit input operand (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        mov     A1, A2                  ; A2 requires longer encoding on Windows; xmm0 already loaded so A1 is free.
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_vpinsrw_u128
5895
5896
;;
; pextrw instruction (MMX variant).
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 The 64-bit source operand (input) - passed by value, not by pointer
;           (note the 'movq mm0, A1' below).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movq    mm0, A1                 ; A1 is the source value itself.
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*8]         ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
 %else
        lea     T0, [A2 + A2*4]         ; sizeof(pextrw+ret) == 5: A2 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate; result in T0_32.
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_pextrw_u64
5933
;; pextrw instruction, SSE variant; A1 is a pointer to the 128-bit source here
;; (unlike the MMX variant above, where A1 carries the value itself).
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate; result in T0_32.
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_pextrw_u128
5963
;;
; vpextrw instruction.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate; result in T0_32.
        mov     word [A0], T0_16

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_vpextrw_u128
6000
6001
;;
; movmskp{s,d} SSE instruction template
;
; Instantiates the SSE 128-bit form plus the AVX 128-bit and 256-bit forms.
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0                ; Collect the packed sign bits into T0.
        mov     byte [A0], T0_8         ; At most 4 mask bits for 128-bit sources, so a byte store suffices.

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0                ; Collect the packed sign bits into T0.
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0                ; Collect the packed sign bits into T0.
        mov     byte [A0], T0_8         ; At most 8 mask bits for 256-bit vmovmskps.

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
6051
6052
;;
; Restores the SSE MXCSR register with the original value.
;
; Merges the exception status flags raised by the emulated instruction into the
; guest MXCSR image given by %2 and stores the combined value at %1.  The final
; ldmxcsr/add pops the host MXCSR pushed by SSE_LD_FXSTATE_MXCSR, restoring it.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; Grab the MXCSR with the freshly raised status flags.
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; Restore the host MXCSR saved by the LD macro.
        add     xSP, 4
%endmacro
6076
6077
;;
; cvttsd2si instruction - 32-bit variant.
;
; Truncating (round-toward-zero) double -> int32 conversion under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvttsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
6098
;;
; cvttsd2si instruction - 64-bit variant.
;
; Truncating (round-toward-zero) double -> int64 conversion under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvttsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
6119
6120
;;
; cvtsd2si instruction - 32-bit variant.
;
; Double -> int32 conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
6141
;;
; cvtsd2si instruction - 64-bit variant.
;
; Double -> int64 conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
6162
6163
;;
; cvttss2si instruction - 32-bit variant.
;
; Truncating (round-toward-zero) float -> int32 conversion under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvttss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32
6184
;;
; cvttss2si instruction - 64-bit variant.
;
; Truncating (round-toward-zero) float -> int64 conversion under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvttss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
6205
6206
;;
; cvtss2si instruction - 32-bit variant.
;
; Float -> int32 conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtss2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32
6227
;;
; cvtss2si instruction - 64-bit variant.
;
; Float -> int64 conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtss2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
6248
6249
;;
; cvtsi2ss instruction - 32-bit variant.
;
; int32 -> float conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsi2ss xmm0, dword [A3]
        movd    dword [A2], xmm0        ; Store the 32-bit float result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32
6270
;;
; cvtsi2ss instruction - 64-bit variant.
;
; int64 -> float conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsi2ss xmm0, qword [A3]
        movd    dword [A2], xmm0        ; Store the 32-bit float result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
6291
6292
;;
; cvtsi2sd instruction - 32-bit variant.
;
; int32 -> double conversion (always exact) under the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsi2sd xmm0, dword [A3]
        movq    [A2], xmm0              ; Store the 64-bit double result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32
6313
;;
; cvtsi2sd instruction - 64-bit variant.
;
; int64 -> double conversion using the rounding mode from the guest MXCSR.
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; Run with the guest's rounding/DAZ/FZ settings.

        cvtsi2sd xmm0, qword [A3]
        movq    [A2], xmm0              ; Store the 64-bit double result.

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; Return merged MXCSR, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
6334
6335
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode.
;
; Only FZ, DAZ and the rounding control are taken from the guest; all exception
; types are masked so the host never takes a SIMD FP fault on the guest's behalf.
; The host MXCSR saved by the first stmxcsr stays on the stack - the matching
; SSE_ST_FXSTATE_MXCSR_ONLY* macro pops and restores it.
;
; @uses 4 bytes of stack to save the original value, T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; Save the host MXCSR (left on stack for the ST macro).
        mov     T0_32, [%1]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
        or      T0_32, X86_MXCSR_XCPT_MASK ; Mask all exceptions.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]
        add     xSP, 4
%endmacro
6355
6356
;;
; Restores the SSE MXCSR register with the original value.
;
; Same as SSE_ST_FXSTATE_MXCSR_ONLY, but the guest MXCSR image lives at %1
; directly (input/output) rather than inside an FXSTATE.  The final
; ldmxcsr/add pops the host MXCSR pushed by SSE_LD_FXSTATE_MXCSR_ONLY.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; Grab the MXCSR with the freshly raised status flags.
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; Restore the host MXCSR saved by the LD macro.
        add     xSP, 4
%endmacro
6379
6380
;
; UCOMISS (SSE)
;
; Unordered scalar single compare; result goes to EFLAGS (ZF/PF/CF).
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128
6403
;; AVX form of ucomiss; identical arguments and flag behaviour.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6418
6419
;
; UCOMISD (SSE)
;
; Unordered scalar double compare; result goes to EFLAGS (ZF/PF/CF).
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128
6442
;; AVX form of ucomisd; identical arguments and flag behaviour.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6457
;
; COMISS (SSE)
;
; Ordered scalar single compare (signals on QNaN, unlike ucomiss); result in EFLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128
6480
;; AVX form of comiss; identical arguments and flag behaviour.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6495
6496
;
; COMISD (SSE)
;
; Ordered scalar double compare (signals on QNaN, unlike ucomisd); result in EFLAGS.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the EFLAGS value (input/output).
; @param A2 Pointer to the first source operand (aka readonly destination).
; @param A3 Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128
6519
;; AVX form of comisd; identical arguments and flag behaviour.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6534
6535
;;
; Two packed 128-bit source operands, as passed to the cmpXX/roundXX/dpXX
; helpers below via a single pointer (A2).
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; First 128-bit source operand.
    .uSrc2 resd 4                       ; Second 128-bit source operand.
endstruc
6543
6544
;
; CMPPS (SSE)
;
; Dispatches through a 256-entry jump array (5-byte stubs; cmpps has no VEX/66
; prefix, so it is one byte shorter than cmppd/cmpss/cmpsd below).
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*8]         ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
 %else
        lea     T0, [A3 + A3*4]         ; sizeof(cmpps+ret) == 5: A3 * 5
 %endif
        lea     T1, [T1 + T0]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_cmpps_u128
6585
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*4]         ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
 %else
        lea     T0, [A3 + A3*2]         ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6637
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register size operand (output).
; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param A3 The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movzx   A3, A3_8                ; must clear top bits
        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A3 + A3*2]         ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
        lea     T1, [T1 + T0*4]
 %else
        lea     T1, [T1 + A3*8]         ; sizeof(insn+ret+int3) == 8: A3 * 8
 %endif
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
        int3                            ; Padding so each stub is exactly 8 bytes.
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6693
6694
;;
; SSE instructions of the form
;    xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; Used for packed double -> packed int32 conversions targeting an MMX register.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX register sized operand (output).
; @param A2 Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A2]
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6724
;;
; SSE instructions of the form
;    xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for packed int32 (from MMX) -> packed float/double conversions.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first media register sized operand (input/output).
; @param A2 The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movdqu  xmm0, [A1]              ; Load destination - upper part may be preserved by the insn.
        movq    mm0, A2                 ; A2 carries the source value itself, not a pointer.
        %1      xmm0, mm0
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6755
;;
; SSE instructions of the form
;    xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Used for packed float -> packed int32 conversions targeting an MMX register.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the MXCSR value (input/output).
; @param A1 Pointer to the first MMX media register sized operand (output).
; @param A2 The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; Run with the guest's rounding/DAZ/FZ settings.

        movq    xmm0, A2                ; A2 carries the source value itself, not a pointer.
        %1      mm0, xmm0
        movq    [A1], mm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; Merge status flags back, restore host MXCSR.
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6785
;
; All forms of RDRAND and RDSEED
;
; @param 1 The instruction name (rdrand/rdseed).
; @param 2 The register to use ((e/r)ax matching the width in %3).
; @param 3 The operand width in bits (16/32/64).
;
; @param A0 Pointer to the destination operand.
; @param A1 Pointer to the EFLAGS value (input/output) - CF signals success.
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                      ; Generate the random value; sets CF on success.
        mov     [A0], %2
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6810
6811
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; Dispatches through a 256-entry jump array because the round-function selector
; is an imm8 (only the low 2 bits are architecturally meaningful).
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movzx   A2, A2_8                ; must clear top bits
        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = base of the per-immediate stub array.
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]
        IBT_NOTRACK
        call    T1                      ; Run the stub matching the immediate.
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600 ; Assert the stub stride assumed above.
ENDPROC iemAImpl_sha1rnds4_u128
6851
6852
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the caller's
; XMM0 value is passed in explicitly via A2 and loaded into xmm0 here.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; set up the implicit XMM0 operand
        movdqu  xmm1, [A0]              ; destination / first source
        movdqu  xmm2, [A1]              ; second source
        sha256rnds2 xmm1, xmm2          ; xmm0 is used implicitly by the instruction
        movdqu  [A0], xmm1              ; store the result

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6875
6876
;
; 32-bit forms of ADCX and ADOX
;
; Loads the guest flag (CF for adcx, OF for adox) into EFLAGS, performs the
; carry-chained add on the destination in memory, and stores the updated flag
; back.  Only the single flag named by %2 is consumed/produced; all other
; flags are untouched by these instructions.
;
; @param 1 The instruction mnemonic (adcx or adox).
; @param 2 The EFLAGS bit the instruction uses (X86_EFL_CF or X86_EFL_OF).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 32-bit source operand 1 (input).
; @param A2 Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg prologue for a 3-arg helper — presumably for scratch regs; confirm

        IEM_LOAD_FLAGS A2, %2, 0        ; bring the guest's CF/OF into the host EFLAGS
        %1      A1_32, [A0]             ; dst-in-memory form: A1_32 += [A0] + flag
        mov     [A0], A1_32
        IEM_SAVE_FLAGS A2, %2, 0        ; write the resulting CF/OF back to the guest

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro
6896
;
; 64-bit forms of ADCX and ADOX
;
; Same as IEMIMPL_ADX_32 but operating on 64-bit quantities: loads the guest
; flag (CF for adcx, OF for adox), performs the carry-chained add on the
; memory destination, and stores the updated flag back.
;
; @param 1 The instruction mnemonic (adcx or adox).
; @param 2 The EFLAGS bit the instruction uses (X86_EFL_CF or X86_EFL_OF).
;
; @param A0 Pointer to the destination operand (input/output).
; @param A1 64-bit source operand 1 (input).
; @param A2 Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): 4-arg prologue for a 3-arg helper — presumably for scratch regs; confirm

        IEM_LOAD_FLAGS A2, %2, 0        ; bring the guest's CF/OF into the host EFLAGS
        %1      A1, [A0]                ; dst-in-memory form: A1 += [A0] + flag
        mov     [A0], A1
        IEM_SAVE_FLAGS A2, %2, 0        ; write the resulting CF/OF back to the guest

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro
6916
; Instantiations: adcx chains through CF, adox through OF.
IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
; (The following lines are website navigation/footer text captured when this
;  file was fetched from the Trac repository browser — not part of the source.)
; 注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器
;
; © 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette