VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 100602

Last change on this file since 100602 was 100602, checked in by vboxsync, 20 months ago

VMM/IEM: Implement vpaddsb/vpaddsw instruction emulations, bugref:9898

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 187.6 KB
 
1; $Id: IEMAllAImpl.asm 100602 2023-07-17 12:13:59Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
41;;
42; RET XX / RET wrapper for fastcall.
43;
44%macro RET_FASTCALL 1
45%ifdef RT_ARCH_X86
46 %ifdef RT_OS_WINDOWS
47 ret %1
48 %else
49 ret
50 %endif
51%else
52 ret
53%endif
54%endmacro
55
56;;
57; NAME for fastcall functions.
58;
59;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
60; escaping (or whatever the dollar is good for here). Thus the ugly
61; prefix argument.
62;
63%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
64%ifdef RT_ARCH_X86
65 %ifdef RT_OS_WINDOWS
66 %undef NAME_FASTCALL
67 %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
68 %endif
69%endif
70
71;;
72; BEGINPROC for fastcall functions.
73;
74; @param 1 The function name (C).
75; @param 2 The argument size on x86.
76;
77%macro BEGINPROC_FASTCALL 2
78 %ifdef ASM_FORMAT_PE
79 export %1=NAME_FASTCALL(%1,%2,$@)
80 %endif
81 %ifdef __NASM__
82 %ifdef ASM_FORMAT_OMF
83 export NAME(%1) NAME_FASTCALL(%1,%2,$@)
84 %endif
85 %endif
86 %ifndef ASM_FORMAT_BIN
87 global NAME_FASTCALL(%1,%2,$@)
88 %endif
89NAME_FASTCALL(%1,%2,@):
90 IBT_ENDBRxx
91%endmacro
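;
; Example (for illustration): on a 32-bit Windows host, where the fastcall
; decoration applies,
;       BEGINPROC_FASTCALL iemAImpl_add_u32, 12
; should yield the decorated symbol '@iemAImpl_add_u32@12', and the matching
;       RET_FASTCALL 12
; becomes 'ret 12' so the callee pops its three dword arguments. On all other
; hosts the plain NAME() label and a bare 'ret' are used instead.
;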
92
93
94;
95; We employ some macro assembly here to hide the calling convention differences.
96;
97%ifdef RT_ARCH_AMD64
98 %macro PROLOGUE_1_ARGS 0
99 %endmacro
100 %macro EPILOGUE_1_ARGS 0
101 ret
102 %endmacro
103 %macro EPILOGUE_1_ARGS_EX 0
104 ret
105 %endmacro
106
107 %macro PROLOGUE_2_ARGS 0
108 %endmacro
109 %macro EPILOGUE_2_ARGS 0
110 ret
111 %endmacro
112 %macro EPILOGUE_2_ARGS_EX 1
113 ret
114 %endmacro
115
116 %macro PROLOGUE_3_ARGS 0
117 %endmacro
118 %macro EPILOGUE_3_ARGS 0
119 ret
120 %endmacro
121 %macro EPILOGUE_3_ARGS_EX 1
122 ret
123 %endmacro
124
125 %macro PROLOGUE_4_ARGS 0
126 %endmacro
127 %macro EPILOGUE_4_ARGS 0
128 ret
129 %endmacro
130 %macro EPILOGUE_4_ARGS_EX 1
131 ret
132 %endmacro
133
134 %ifdef ASM_CALL64_GCC
135 %define A0 rdi
136 %define A0_32 edi
137 %define A0_16 di
138 %define A0_8 dil
139
140 %define A1 rsi
141 %define A1_32 esi
142 %define A1_16 si
143 %define A1_8 sil
144
145 %define A2 rdx
146 %define A2_32 edx
147 %define A2_16 dx
148 %define A2_8 dl
149
150 %define A3 rcx
151 %define A3_32 ecx
152 %define A3_16 cx
153 %endif
154
155 %ifdef ASM_CALL64_MSC
156 %define A0 rcx
157 %define A0_32 ecx
158 %define A0_16 cx
159 %define A0_8 cl
160
161 %define A1 rdx
162 %define A1_32 edx
163 %define A1_16 dx
164 %define A1_8 dl
165
166 %define A2 r8
167 %define A2_32 r8d
168 %define A2_16 r8w
169 %define A2_8 r8b
170
171 %define A3 r9
172 %define A3_32 r9d
173 %define A3_16 r9w
174 %endif
175
176 %define T0 rax
177 %define T0_32 eax
178 %define T0_16 ax
179 %define T0_8 al
180
181 %define T1 r11
182 %define T1_32 r11d
183 %define T1_16 r11w
184 %define T1_8 r11b
185
186 %define T2 r10 ; only AMD64
187 %define T2_32 r10d
188 %define T2_16 r10w
189 %define T2_8 r10b
190
191%else
192 ; x86
193 %macro PROLOGUE_1_ARGS 0
194 push edi
195 %endmacro
196 %macro EPILOGUE_1_ARGS 0
197 pop edi
198 ret 0
199 %endmacro
200 %macro EPILOGUE_1_ARGS_EX 1
201 pop edi
202 ret %1
203 %endmacro
204
205 %macro PROLOGUE_2_ARGS 0
206 push edi
207 %endmacro
208 %macro EPILOGUE_2_ARGS 0
209 pop edi
210 ret 0
211 %endmacro
212 %macro EPILOGUE_2_ARGS_EX 1
213 pop edi
214 ret %1
215 %endmacro
216
217 %macro PROLOGUE_3_ARGS 0
218 push ebx
219 mov ebx, [esp + 4 + 4]
220 push edi
221 %endmacro
222 %macro EPILOGUE_3_ARGS_EX 1
223 %if (%1) < 4
224 %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
225 %endif
226 pop edi
227 pop ebx
228 ret %1
229 %endmacro
230 %macro EPILOGUE_3_ARGS 0
231 EPILOGUE_3_ARGS_EX 4
232 %endmacro
233
234 %macro PROLOGUE_4_ARGS 0
235 push ebx
236 push edi
237 push esi
238 mov ebx, [esp + 12 + 4 + 0]
239 mov esi, [esp + 12 + 4 + 4]
240 %endmacro
241 %macro EPILOGUE_4_ARGS_EX 1
242 %if (%1) < 8
243 %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
244 %endif
245 pop esi
246 pop edi
247 pop ebx
248 ret %1
249 %endmacro
250 %macro EPILOGUE_4_ARGS 0
251 EPILOGUE_4_ARGS_EX 8
252 %endmacro
253
254 %define A0 ecx
255 %define A0_32 ecx
256 %define A0_16 cx
257 %define A0_8 cl
258
259 %define A1 edx
260 %define A1_32 edx
261 %define A1_16 dx
262 %define A1_8 dl
263
264 %define A2 ebx
265 %define A2_32 ebx
266 %define A2_16 bx
267 %define A2_8 bl
268
269 %define A3 esi
270 %define A3_32 esi
271 %define A3_16 si
272
273 %define T0 eax
274 %define T0_32 eax
275 %define T0_16 ax
276 %define T0_8 al
277
278 %define T1 edi
279 %define T1_32 edi
280 %define T1_16 di
281%endif
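;
; Example (for illustration): a three-argument helper such as
;       iemAImpl_add_u32(uint32_t *pu32Dst, uint32_t u32Src, uint32_t *pEFlags)
; (prototype shown roughly; see the C headers for the exact declaration) sees
; its arguments as
;       AMD64/GCC:      A0 = rdi, A1 = rsi, A2 = rdx
;       AMD64/MSC:      A0 = rcx, A1 = rdx, A2 = r8
;       x86 fastcall:   A0 = ecx, A1 = edx, A2 = ebx (loaded from the stack by
;                       PROLOGUE_3_ARGS below)
; T0..T2 are scratch; registers that are callee-saved in a given convention
; (ebx, esi, edi on x86) are pushed and popped by the PROLOGUE/EPILOGUE macros.
;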
282
283
284;;
285; Load the relevant flags from [%1] if there are undefined flags (%3).
286;
287; @remarks Clobbers T0, stack. Changes EFLAGS.
288; @param A2 The register pointing to the flags.
289; @param 1 The parameter (A0..A3) pointing to the eflags.
290; @param 2 The set of modified flags.
291; @param 3 The set of undefined flags.
292;
293%macro IEM_MAYBE_LOAD_FLAGS 3
294 ;%if (%3) != 0
295 pushf ; store current flags
296 mov T0_32, [%1] ; load the guest flags
297 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
298 and T0_32, (%2 | %3) ; select the modified and undefined flags.
299 or [xSP], T0 ; merge guest flags with host flags.
300 popf ; load the mixed flags.
301 ;%endif
302%endmacro
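;
; Illustrative expansion (assuming A2 points at the guest EFLAGS and CF is the
; only modified flag): IEM_MAYBE_LOAD_FLAGS A2, X86_EFL_CF, 0 becomes
;       pushf
;       mov     T0_32, [A2]
;       and     dword [xSP], ~(X86_EFL_CF | 0)
;       and     T0_32, (X86_EFL_CF | 0)
;       or      [xSP], T0
;       popf
; i.e. the host keeps all of its own EFLAGS except CF, which is taken from the
; guest value, so the instruction that follows sees the guest's carry input.
;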
303
304;;
305; Load the relevant flags from [%1].
306;
307; @remarks Clobbers T0, stack. Changes EFLAGS.
308; @param A2 The register pointing to the flags.
309; @param 1 The parameter (A0..A3) pointing to the eflags.
310; @param 2 The set of flags to load.
311; @param 3 The set of undefined flags.
312;
313%macro IEM_LOAD_FLAGS 3
314 pushf ; store current flags
315 mov T0_32, [%1] ; load the guest flags
316 and dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
317 and T0_32, (%2 | %3) ; select the modified and undefined flags.
318 or [xSP], T0 ; merge guest flags with host flags.
319 popf ; load the mixed flags.
320%endmacro
321
322;;
323; Update the flags.
324;
325; @remarks Clobbers T0, T1, stack.
326; @param 1 The register pointing to the EFLAGS.
327; @param 2 The mask of modified flags to save.
328; @param 3 The mask of undefined flags to (maybe) save.
329;
330%macro IEM_SAVE_FLAGS 3
331 %if (%2 | %3) != 0
332 pushf
333 pop T1
334 mov T0_32, [%1] ; flags
335 and T0_32, ~(%2 | %3) ; clear the modified & undefined flags.
336 and T1_32, (%2 | %3) ; select the modified and undefined flags.
337 or T0_32, T1_32 ; combine the flags.
338 mov [%1], T0_32 ; save the flags.
339 %endif
340%endmacro
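;
; Worked example (for illustration): with IEM_SAVE_FLAGS A2, X86_EFL_CF | X86_EFL_ZF, 0,
; a guest EFLAGS value of 0202h at [A2], and a host EFLAGS where the preceding
; instruction left CF=1 and ZF=0, the value written back is
;       (0202h & ~41h) | (host & 41h) = 0202h | 01h = 0203h
; so only CF and ZF are refreshed while IF and the other guest bits survive.
;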
341
342;;
343; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
344;
345; @remarks Clobbers T0, T1, stack.
346; @param 1 The register pointing to the EFLAGS.
347; @param 2 The mask of modified flags to save.
348; @param 3 Mask of additional flags to always clear
349; @param 4 Mask of additional flags to always set.
350;
351%macro IEM_SAVE_AND_ADJUST_FLAGS 4
352 %if (%2 | %3 | %4) != 0
353 pushf
354 pop T1
355 mov T0_32, [%1] ; load flags.
356 and T0_32, ~(%2 | %3) ; clear the modified and always cleared flags.
357 and T1_32, (%2) ; select the modified flags.
358 or T0_32, T1_32 ; combine the flags.
359 %if (%4) != 0
360 or T0_32, %4 ; add the always set flags.
361 %endif
362 mov [%1], T0_32 ; save the result.
363 %endif
364%endmacro
365
366;;
367; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
368; signed input (%4[%5]) and parity index (%6).
369;
370; This is used by MUL and IMUL, where the result (%4 & %6) is in xAX, which is
371; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
372; while we extract the %2 flags from the CPU EFLAGS, or use T2 (AMD64 only).
373;
374; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
375; @param 1 The register pointing to the EFLAGS.
376; @param 2 The mask of modified flags to save.
377; @param 3 Mask of additional flags to always clear
378; @param 4 The result register to set SF by.
379; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
380; @param 6 The (full) register containing the parity table index. Will be modified!
381
382%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
383 %ifdef RT_ARCH_AMD64
384 pushf
385 pop T2
386 %else
387 push T0
388 pushf
389 pop T0
390 %endif
391 mov T1_32, [%1] ; load flags.
392 and T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
393 %ifdef RT_ARCH_AMD64
394 and T2_32, (%2) ; select the modified flags.
395 or T1_32, T2_32 ; combine the flags.
396 %else
397 and T0_32, (%2) ; select the modified flags.
398 or T1_32, T0_32 ; combine the flags.
399 pop T0
400 %endif
401
402 ; First calculate SF as it's likely to be referring to the same register as %6 does.
403 bt %4, %5 - 1
404 jnc %%sf_clear
405 or T1_32, X86_EFL_SF
406 %%sf_clear:
407
408 ; Parity last.
409 and %6, 0xff
410 %ifdef RT_ARCH_AMD64
411 lea T2, [NAME(g_afParity) xWrtRIP]
412 or T1_8, [T2 + %6]
413 %else
414 or T1_8, [NAME(g_afParity) + %6]
415 %endif
416
417 mov [%1], T1_32 ; save the result.
418%endmacro
419
420;;
421; Calculates the new EFLAGS using fixed clear and set bit masks.
422;
423; @remarks Clobbers T0.
424; @param 1 The register pointing to the EFLAGS.
425; @param 2 Mask of additional flags to always clear
426; @param 3 Mask of additional flags to always set.
427;
428%macro IEM_ADJUST_FLAGS 3
429 %if (%2 | %3) != 0
430 mov T0_32, [%1] ; Load flags.
431 %if (%2) != 0
432 and T0_32, ~(%2) ; Remove the always cleared flags.
433 %endif
434 %if (%3) != 0
435 or T0_32, %3 ; Add the always set flags.
436 %endif
437 mov [%1], T0_32 ; Save the result.
438 %endif
439%endmacro
440
441;;
442; Calculates the new EFLAGS using fixed clear and set bit masks.
443;
444; @remarks Clobbers T0, %4, EFLAGS.
445; @param 1 The register pointing to the EFLAGS.
446; @param 2 Mask of additional flags to always clear
447; @param 3 Mask of additional flags to always set.
448; @param 4 The (full) register containing the parity table index. Will be modified!
449;
450%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
451 mov T0_32, [%1] ; Load flags.
452 and T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
453 %if (%3) != 0
454 or T0_32, %3 ; Add the always set flags.
455 %endif
456 and %4, 0xff
457 %ifdef RT_ARCH_AMD64
458 lea T2, [NAME(g_afParity) xWrtRIP]
459 or T0_8, [T2 + %4]
460 %else
461 or T0_8, [NAME(g_afParity) + %4]
462 %endif
463 mov [%1], T0_32 ; Save the result.
464%endmacro
465
466
467;;
468; Checks that the size expression %1 matches %2 adjusted according to
469; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
470; @param 1 The jump array size assembly expression.
471; @param 2 The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
472;
473%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
474 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
475 dw (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
476 dw (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
477 %else
478 dw (0xffff - %2) + %1 ; will cause warning if entries are too big.
479 dw (0xffff + %2) - %1 ; will cause warning if entries are too small.
480 %endif
481%endmacro
482
483
484;*********************************************************************************************************************************
485;* External Symbols *
486;*********************************************************************************************************************************
487extern NAME(g_afParity)
488
489
490;;
491; Macro for implementing a binary operator.
492;
493; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
494; variants, except on 32-bit systems where the 64-bit accesses require hand
495; coding.
496;
497; All the functions take a pointer to the destination memory operand in A0,
498; the source register operand in A1 and a pointer to eflags in A2.
499;
500; @param 1 The instruction mnemonic.
501; @param 2 Non-zero if there should be a locked version.
502; @param 3 The modified flags.
503; @param 4 The undefined flags.
504;
505%macro IEMIMPL_BIN_OP 4
506BEGINCODE
507BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
508 PROLOGUE_3_ARGS
509 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
510 %1 byte [A0], A1_8
511 IEM_SAVE_FLAGS A2, %3, %4
512 EPILOGUE_3_ARGS
513ENDPROC iemAImpl_ %+ %1 %+ _u8
514
515BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
516 PROLOGUE_3_ARGS
517 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
518 %1 word [A0], A1_16
519 IEM_SAVE_FLAGS A2, %3, %4
520 EPILOGUE_3_ARGS
521ENDPROC iemAImpl_ %+ %1 %+ _u16
522
523BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
524 PROLOGUE_3_ARGS
525 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
526 %1 dword [A0], A1_32
527 IEM_SAVE_FLAGS A2, %3, %4
528 EPILOGUE_3_ARGS
529ENDPROC iemAImpl_ %+ %1 %+ _u32
530
531 %ifdef RT_ARCH_AMD64
532BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
533 PROLOGUE_3_ARGS
534 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
535 %1 qword [A0], A1
536 IEM_SAVE_FLAGS A2, %3, %4
537 EPILOGUE_3_ARGS_EX 8
538ENDPROC iemAImpl_ %+ %1 %+ _u64
539 %endif ; RT_ARCH_AMD64
540
541 %if %2 != 0 ; locked versions requested?
542
543BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
544 PROLOGUE_3_ARGS
545 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
546 lock %1 byte [A0], A1_8
547 IEM_SAVE_FLAGS A2, %3, %4
548 EPILOGUE_3_ARGS
549ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
550
551BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
552 PROLOGUE_3_ARGS
553 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
554 lock %1 word [A0], A1_16
555 IEM_SAVE_FLAGS A2, %3, %4
556 EPILOGUE_3_ARGS
557ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
558
559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
560 PROLOGUE_3_ARGS
561 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
562 lock %1 dword [A0], A1_32
563 IEM_SAVE_FLAGS A2, %3, %4
564 EPILOGUE_3_ARGS
565ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
566
567 %ifdef RT_ARCH_AMD64
568BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
569 PROLOGUE_3_ARGS
570 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
571 lock %1 qword [A0], A1
572 IEM_SAVE_FLAGS A2, %3, %4
573 EPILOGUE_3_ARGS_EX 8
574ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
575 %endif ; RT_ARCH_AMD64
576 %endif ; locked
577%endmacro
578
579; instr,lock, modified-flags, undefined flags
580IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
581IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
582IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
583IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
584IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
585IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
586IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
587IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
588IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
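;
; Illustrative expansion: the 'add' line above generates, among others, a
; 32-bit helper roughly equivalent to
;       BEGINPROC_FASTCALL iemAImpl_add_u32, 12
;               PROLOGUE_3_ARGS
;               IEM_MAYBE_LOAD_FLAGS A2, <status flags>, 0
;               add     dword [A0], A1_32
;               IEM_SAVE_FLAGS A2, <status flags>, 0
;               EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_add_u32
; plus a '_locked' twin that only differs by the LOCK prefix. 'cmp' and 'test'
; pass 0 for the lock parameter because they do not write their destination,
; so a LOCK prefix would be meaningless (and invalid) for them.
;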
589
590
591;;
592; Macro for implementing a binary operator, VEX variant with separate input/output.
593;
594; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
595; where the 64-bit accesses require hand coding.
596;
597; All the functions take a pointer to the destination memory operand in A0,
598; the first source register operand in A1, the second source register operand
599; in A2 and a pointer to eflags in A3.
600;
601; @param 1 The instruction mnemonic.
602; @param 2 The modified flags.
603; @param 3 The undefined flags.
604;
605%macro IEMIMPL_VEX_BIN_OP 3
606BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
607 PROLOGUE_4_ARGS
608 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
609 %1 T0_32, A1_32, A2_32
610 mov [A0], T0_32
611 IEM_SAVE_FLAGS A3, %2, %3
612 EPILOGUE_4_ARGS
613ENDPROC iemAImpl_ %+ %1 %+ _u32
614
615 %ifdef RT_ARCH_AMD64
616BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
617 PROLOGUE_4_ARGS
618 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
619 %1 T0, A1, A2
620 mov [A0], T0
621 IEM_SAVE_FLAGS A3, %2, %3
622 EPILOGUE_4_ARGS
623ENDPROC iemAImpl_ %+ %1 %+ _u64
624 %endif ; RT_ARCH_AMD64
625%endmacro
626
627; instr, modified-flags, undefined-flags
628IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
629IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
630IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
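;
; Example (for illustration): the generated iemAImpl_andn_u32 computes
; roughly *puDst = ~uSrc1 & uSrc2 (parameter names are illustrative; see the
; C prototypes for the real ones), e.g. uSrc1=00000F0Fh and uSrc2=0000FFFFh
; give 0000F0F0h, with SF/ZF set from the result, OF/CF always cleared and
; AF/PF left undefined as declared above.
;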
631
632;;
633; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
634;
635; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
636; where the 64-bit accesses require hand coding.
637;
638; All the functions take a pointer to the destination memory operand in A0,
639; the source register operand in A1 and a pointer to eflags in A2.
640;
641; @param 1 The instruction mnemonic.
642; @param 2 The modified flags.
643; @param 3 The undefined flags.
644;
645%macro IEMIMPL_VEX_BIN_OP_2 3
646BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
647 PROLOGUE_4_ARGS
648 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
649 mov T0_32, [A0]
650 %1 T0_32, A1_32
651 mov [A0], T0_32
652 IEM_SAVE_FLAGS A2, %2, %3
653 EPILOGUE_4_ARGS
654ENDPROC iemAImpl_ %+ %1 %+ _u32
655
656 %ifdef RT_ARCH_AMD64
657BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
658 PROLOGUE_4_ARGS
659 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
660 mov T0, [A0]
661 %1 T0, A1
662 mov [A0], T0
663 IEM_SAVE_FLAGS A2, %2, %3
664 EPILOGUE_4_ARGS
665ENDPROC iemAImpl_ %+ %1 %+ _u64
666 %endif ; RT_ARCH_AMD64
667%endmacro
668
669; instr, modified-flags, undefined-flags
670IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
671IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
672IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
673
674
675;;
676; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
677;
678; This will generate code for the 32 and 64 bit accesses, except on 32-bit systems
679; where the 64-bit accesses require hand coding.
680;
681; All the functions take a pointer to the destination memory operand in A0,
682; the first source register operand in A1 and the second source register operand
683; in A2. These functions do not take an eflags pointer.
684;
685; @param 1 The instruction mnemonic.
686; @param 2 Fallback instruction if applicable.
687; @param 3 Whether to emit fallback or not.
688;
689%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
690BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
691 PROLOGUE_3_ARGS
692 %1 T0_32, A1_32, A2_32
693 mov [A0], T0_32
694 EPILOGUE_3_ARGS
695ENDPROC iemAImpl_ %+ %1 %+ _u32
696
697 %if %3
698BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
699 PROLOGUE_3_ARGS
700 %ifdef ASM_CALL64_GCC
701 mov cl, A2_8
702 %2 A1_32, cl
703 mov [A0], A1_32
704 %else
705 xchg A2, A0
706 %2 A1_32, cl
707 mov [A2], A1_32
708 %endif
709 EPILOGUE_3_ARGS
710ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
711 %endif
712
713 %ifdef RT_ARCH_AMD64
714BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
715 PROLOGUE_3_ARGS
716 %1 T0, A1, A2
717 mov [A0], T0
718 EPILOGUE_3_ARGS
719ENDPROC iemAImpl_ %+ %1 %+ _u64
720
721 %if %3
722BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
723 PROLOGUE_3_ARGS
724 %ifdef ASM_CALL64_GCC
725 mov cl, A2_8
726 %2 A1, cl
727 mov [A0], A1_32
728 %else
729 xchg A2, A0
730 %2 A1, cl
731 mov [A2], A1_32
732 %endif
733 mov [A0], A1
734 EPILOGUE_3_ARGS
735ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
736 %endif
737 %endif ; RT_ARCH_AMD64
738%endmacro
739
740; instr, fallback instr, emit fallback
741IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
742IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
743IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
744IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
745IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
746
747
748;
749; RORX uses an immediate byte for the shift count, so we only do a
750; fallback implementation of that one.
751;
752BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
753 PROLOGUE_3_ARGS
754 %ifdef ASM_CALL64_GCC
755 mov cl, A2_8
756 ror A1_32, cl
757 mov [A0], A1_32
758 %else
759 xchg A2, A0
760 ror A1_32, cl
761 mov [A2], A1_32
762 %endif
763 EPILOGUE_3_ARGS
764ENDPROC iemAImpl_rorx_u32
765
766 %ifdef RT_ARCH_AMD64
767BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
768 PROLOGUE_3_ARGS
769 %ifdef ASM_CALL64_GCC
770 mov cl, A2_8
771 ror A1, cl
772 mov [A0], A1
773 %else
774 xchg A2, A0
775 ror A1, cl
776 mov [A2], A1
777 %endif
778 EPILOGUE_3_ARGS
779ENDPROC iemAImpl_rorx_u64
780 %endif ; RT_ARCH_AMD64
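;
; Worked example (for illustration): RORX itself never touches EFLAGS, so the
; plain ROR used here is fine only because these helpers are C fallbacks and
; no guest flags are loaded or saved around it. For instance
;       iemAImpl_rorx_u32(&uDst, 0x12345678, 8)
; should leave uDst = 0x78123456 (rotate right by 8 bits).
;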
781
782
783;
784; MULX
785;
786BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
787 PROLOGUE_4_ARGS
788%ifdef ASM_CALL64_GCC
789 ; A2_32 is EDX - perfect
790 mulx T0_32, T1_32, A3_32
791 mov [A1], T1_32 ; Low value first, as we should return the high part if same destination registers.
792 mov [A0], T0_32
793%else
794 ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
795 xchg A1, A2
796 mulx T0_32, T1_32, A3_32
797 mov [A2], T1_32 ; Low value first, as we should return the high part if same destination registers.
798 mov [A0], T0_32
799%endif
800 EPILOGUE_4_ARGS
801ENDPROC iemAImpl_mulx_u32
802
803
804BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
805 PROLOGUE_4_ARGS
806%ifdef ASM_CALL64_GCC
807 ; A2_32 is EDX, T0_32 is EAX
808 mov eax, A3_32
809 mul A2_32
810 mov [A1], eax ; Low value first, as we should return the high part if same destination registers.
811 mov [A0], edx
812%else
813 ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
814 xchg A1, A2
815 mov eax, A3_32
816 mul A2_32
817 mov [A2], eax ; Low value first, as we should return the high part if same destination registers.
818 mov [A0], edx
819%endif
820 EPILOGUE_4_ARGS
821ENDPROC iemAImpl_mulx_u32_fallback
822
823%ifdef RT_ARCH_AMD64
824BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
825 PROLOGUE_4_ARGS
826%ifdef ASM_CALL64_GCC
827 ; A2 is RDX - perfect
828 mulx T0, T1, A3
829 mov [A1], T1 ; Low value first, as we should return the high part if same destination registers.
830 mov [A0], T0
831%else
832 ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
833 xchg A1, A2
834 mulx T0, T1, A3
835 mov [A2], T1 ; Low value first, as we should return the high part if same destination registers.
836 mov [A0], T0
837%endif
838 EPILOGUE_4_ARGS
839ENDPROC iemAImpl_mulx_u64
840
841
842BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
843 PROLOGUE_4_ARGS
844%ifdef ASM_CALL64_GCC
845 ; A2 is RDX, T0 is RAX
846 mov rax, A3
847 mul A2
848 mov [A1], rax ; Low value first, as we should return the high part if same destination registers.
849 mov [A0], rdx
850%else
851 ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
852 xchg A1, A2
853 mov rax, A3
854 mul A2
855 mov [A2], rax ; Low value first, as we should return the high part if same destination registers.
856 mov [A0], rdx
857%endif
858 EPILOGUE_4_ARGS
859ENDPROC iemAImpl_mulx_u64_fallback
860
861%endif
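;
; Worked example (for illustration): MULX multiplies (E/R)DX by its source
; operand without touching EFLAGS. With an implicit EDX source of 80000000h
; and an explicit source of 4, iemAImpl_mulx_u32 should store 00000002h to the
; high destination and 00000000h to the low one; storing the low half first
; matters when both destination pointers alias the same guest register.
;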
862
863
864;;
865; Macro for implementing a bit operator.
866;
867; This will generate code for the 16, 32 and 64 bit accesses with locked
868; variants, except on 32-bit systems where the 64-bit accesses require hand
869; coding.
870;
871; All the functions take a pointer to the destination memory operand in A0,
872; the source register operand in A1 and a pointer to eflags in A2.
873;
874; @param 1 The instruction mnemonic.
875; @param 2 Non-zero if there should be a locked version.
876; @param 3 The modified flags.
877; @param 4 The undefined flags.
878;
879%macro IEMIMPL_BIT_OP 4
880BEGINCODE
881BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
882 PROLOGUE_3_ARGS
883 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
884 %1 word [A0], A1_16
885 IEM_SAVE_FLAGS A2, %3, %4
886 EPILOGUE_3_ARGS
887ENDPROC iemAImpl_ %+ %1 %+ _u16
888
889BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
890 PROLOGUE_3_ARGS
891 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
892 %1 dword [A0], A1_32
893 IEM_SAVE_FLAGS A2, %3, %4
894 EPILOGUE_3_ARGS
895ENDPROC iemAImpl_ %+ %1 %+ _u32
896
897 %ifdef RT_ARCH_AMD64
898BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
899 PROLOGUE_3_ARGS
900 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
901 %1 qword [A0], A1
902 IEM_SAVE_FLAGS A2, %3, %4
903 EPILOGUE_3_ARGS_EX 8
904ENDPROC iemAImpl_ %+ %1 %+ _u64
905 %endif ; RT_ARCH_AMD64
906
907 %if %2 != 0 ; locked versions requested?
908
909BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
910 PROLOGUE_3_ARGS
911 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
912 lock %1 word [A0], A1_16
913 IEM_SAVE_FLAGS A2, %3, %4
914 EPILOGUE_3_ARGS
915ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
916
917BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
918 PROLOGUE_3_ARGS
919 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
920 lock %1 dword [A0], A1_32
921 IEM_SAVE_FLAGS A2, %3, %4
922 EPILOGUE_3_ARGS
923ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
924
925 %ifdef RT_ARCH_AMD64
926BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
927 PROLOGUE_3_ARGS
928 IEM_MAYBE_LOAD_FLAGS A2, %3, %4
929 lock %1 qword [A0], A1
930 IEM_SAVE_FLAGS A2, %3, %4
931 EPILOGUE_3_ARGS_EX 8
932ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
933 %endif ; RT_ARCH_AMD64
934 %endif ; locked
935%endmacro
936IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
937IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
938IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
939IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
940
941;;
942; Macro for implementing a bit search operator.
943;
944; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
945; systems where the 64-bit accesses require hand coding.
946;
947; All the functions take a pointer to the destination memory operand in A0,
948; the source register operand in A1 and a pointer to eflags in A2.
949;
950; In the ZF case the destination register is 'undefined'; however, it seems that
951; both AMD and Intel just leave it as is. The undefined EFLAGS differ between
952; AMD and Intel and, according to https://www.sandpile.org/x86/flags.htm, between
953; Intel microarchitectures. We only implement the 'intel' and 'amd' variations with
954; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
955;
956; @param 1 The instruction mnemonic.
957; @param 2 The modified flags.
958; @param 3 The undefined flags.
959; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
960;
961%macro IEMIMPL_BIT_OP2 4
962BEGINCODE
963BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
964 PROLOGUE_3_ARGS
965 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
966 %1 T0_16, A1_16
967%if %4 != 0
968 jz .unchanged_dst
969%endif
970 mov [A0], T0_16
971.unchanged_dst:
972 IEM_SAVE_FLAGS A2, %2, %3
973 EPILOGUE_3_ARGS
974ENDPROC iemAImpl_ %+ %1 %+ _u16
975
976BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
977 PROLOGUE_3_ARGS
978 %1 T1_16, A1_16
979%if %4 != 0
980 jz .unchanged_dst
981%endif
982 mov [A0], T1_16
983 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
984 EPILOGUE_3_ARGS
985.unchanged_dst:
986 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
987 EPILOGUE_3_ARGS
988ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
991 PROLOGUE_3_ARGS
992 %1 T0_16, A1_16
993%if %4 != 0
994 jz .unchanged_dst
995%endif
996 mov [A0], T0_16
997.unchanged_dst:
998 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
999 EPILOGUE_3_ARGS
1000ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1001
1002
1003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1004 PROLOGUE_3_ARGS
1005 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1006 %1 T0_32, A1_32
1007%if %4 != 0
1008 jz .unchanged_dst
1009%endif
1010 mov [A0], T0_32
1011.unchanged_dst:
1012 IEM_SAVE_FLAGS A2, %2, %3
1013 EPILOGUE_3_ARGS
1014ENDPROC iemAImpl_ %+ %1 %+ _u32
1015
1016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1017 PROLOGUE_3_ARGS
1018 %1 T1_32, A1_32
1019%if %4 != 0
1020 jz .unchanged_dst
1021%endif
1022 mov [A0], T1_32
1023 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1024 EPILOGUE_3_ARGS
1025.unchanged_dst:
1026 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1027 EPILOGUE_3_ARGS
1028ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1029
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1031 PROLOGUE_3_ARGS
1032 %1 T0_32, A1_32
1033%if %4 != 0
1034 jz .unchanged_dst
1035%endif
1036 mov [A0], T0_32
1037.unchanged_dst:
1038 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1039 EPILOGUE_3_ARGS
1040ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1041
1042
1043 %ifdef RT_ARCH_AMD64
1044
1045BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1046 PROLOGUE_3_ARGS
1047 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1048 %1 T0, A1
1049%if %4 != 0
1050 jz .unchanged_dst
1051%endif
1052 mov [A0], T0
1053.unchanged_dst:
1054 IEM_SAVE_FLAGS A2, %2, %3
1055 EPILOGUE_3_ARGS_EX 8
1056ENDPROC iemAImpl_ %+ %1 %+ _u64
1057
1058BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1059 PROLOGUE_3_ARGS
1060 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1061 %1 T1, A1
1062%if %4 != 0
1063 jz .unchanged_dst
1064%endif
1065 mov [A0], T1
1066 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1067 EPILOGUE_3_ARGS
1068.unchanged_dst:
1069 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1070 EPILOGUE_3_ARGS
1071ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1072
1073BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1074 PROLOGUE_3_ARGS
1075 %1 T0, A1
1076%if %4 != 0
1077 jz .unchanged_dst
1078%endif
1079 mov [A0], T0
1080.unchanged_dst:
1081 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1082 EPILOGUE_3_ARGS_EX 8
1083ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1084
1085 %endif ; RT_ARCH_AMD64
1086%endmacro
1087
1088IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1089IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
1090IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1091IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
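;
; Example (for illustration) of the zero-source behaviour selected by the last
; macro parameter: for a source of 0, BSF/BSR set ZF=1 and the destination is
; left untouched (the store is skipped above), whereas TZCNT/LZCNT always
; write the destination, e.g. a 32-bit TZCNT of 0 stores 32 with CF=1 and ZF=0.
;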
1092
1093
1094;;
1095; Macro for implementing POPCNT.
1096;
1097; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
1098; systems where the 64-bit accesses require hand coding.
1099;
1100; All the functions take a pointer to the destination memory operand in A0,
1101; the source register operand in A1 and a pointer to eflags in A2.
1102;
1103; ASSUMES Intel and AMD set EFLAGS the same way.
1104;
1105; ASSUMES the instruction does not support memory destination.
1106;
1107; @param 1 The instruction mnemonic.
1108; @param 2 The modified flags.
1109; @param 3 The undefined flags.
1110;
1111%macro IEMIMPL_BIT_OP3 3
1112BEGINCODE
1113BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1114 PROLOGUE_3_ARGS
1115 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1116 %1 T0_16, A1_16
1117 mov [A0], T0_16
1118 IEM_SAVE_FLAGS A2, %2, %3
1119 EPILOGUE_3_ARGS
1120ENDPROC iemAImpl_ %+ %1 %+ _u16
1121
1122BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1123 PROLOGUE_3_ARGS
1124 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1125 %1 T0_32, A1_32
1126 mov [A0], T0_32
1127 IEM_SAVE_FLAGS A2, %2, %3
1128 EPILOGUE_3_ARGS
1129ENDPROC iemAImpl_ %+ %1 %+ _u32
1130
1131 %ifdef RT_ARCH_AMD64
1132BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1133 PROLOGUE_3_ARGS
1134 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1135 %1 T0, A1
1136 mov [A0], T0
1137 IEM_SAVE_FLAGS A2, %2, %3
1138 EPILOGUE_3_ARGS_EX 8
1139ENDPROC iemAImpl_ %+ %1 %+ _u64
1140 %endif ; RT_ARCH_AMD64
1141%endmacro
1142IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1143
1144
1145;
1146; IMUL is a similar yet different case (no lock, no mem dst).
1147; The rDX:rAX variant of imul is handled together with mul further down.
1148;
1149BEGINCODE
1150; @param 1 EFLAGS that are modified.
1151; @param 2 Undefined EFLAGS.
1152; @param 3 Function suffix.
1153; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1154; 2 for AMD (set AF, clear PF, ZF and SF).
1155%macro IEMIMPL_IMUL_TWO 4
1156BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
1157 PROLOGUE_3_ARGS
1158 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1159 imul A1_16, word [A0]
1160 mov [A0], A1_16
1161 %if %4 != 1
1162 IEM_SAVE_FLAGS A2, %1, %2
1163 %else
1164 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
1165 %endif
1166 EPILOGUE_3_ARGS
1167ENDPROC iemAImpl_imul_two_u16 %+ %3
1168
1169BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
1170 PROLOGUE_3_ARGS
1171 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1172 imul A1_32, dword [A0]
1173 mov [A0], A1_32
1174 %if %4 != 1
1175 IEM_SAVE_FLAGS A2, %1, %2
1176 %else
1177 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
1178 %endif
1179 EPILOGUE_3_ARGS
1180ENDPROC iemAImpl_imul_two_u32 %+ %3
1181
1182 %ifdef RT_ARCH_AMD64
1183BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
1184 PROLOGUE_3_ARGS
1185 IEM_MAYBE_LOAD_FLAGS A2, %1, %2
1186 imul A1, qword [A0]
1187 mov [A0], A1
1188 %if %4 != 1
1189 IEM_SAVE_FLAGS A2, %1, %2
1190 %else
1191 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
1192 %endif
1193 EPILOGUE_3_ARGS_EX 8
1194ENDPROC iemAImpl_imul_two_u64 %+ %3
1195 %endif ; RT_ARCH_AMD64
1196%endmacro
1197IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
1198IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
1199IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
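;
; Worked example (for illustration): the two-operand IMUL only defines CF and
; OF, which signal that the full signed product did not fit the destination.
; For iemAImpl_imul_two_u16 with 300 * 200, the product 60000 (0EA60h) exceeds
; the signed 16-bit range, so 0EA60h is stored and CF=OF=1; 100 * 200 = 20000
; fits and leaves both flags clear.
;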
1200
1201
1202;
1203; XCHG for memory operands. This implies locking. No flag changes.
1204;
1205; Each function takes two arguments, first the pointer to the memory,
1206; then the pointer to the register. They all return void.
1207;
1208BEGINCODE
1209BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
1210 PROLOGUE_2_ARGS
1211 mov T0_8, [A1]
1212 xchg [A0], T0_8
1213 mov [A1], T0_8
1214 EPILOGUE_2_ARGS
1215ENDPROC iemAImpl_xchg_u8_locked
1216
1217BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
1218 PROLOGUE_2_ARGS
1219 mov T0_16, [A1]
1220 xchg [A0], T0_16
1221 mov [A1], T0_16
1222 EPILOGUE_2_ARGS
1223ENDPROC iemAImpl_xchg_u16_locked
1224
1225BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
1226 PROLOGUE_2_ARGS
1227 mov T0_32, [A1]
1228 xchg [A0], T0_32
1229 mov [A1], T0_32
1230 EPILOGUE_2_ARGS
1231ENDPROC iemAImpl_xchg_u32_locked
1232
1233%ifdef RT_ARCH_AMD64
1234BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
1235 PROLOGUE_2_ARGS
1236 mov T0, [A1]
1237 xchg [A0], T0
1238 mov [A1], T0
1239 EPILOGUE_2_ARGS
1240ENDPROC iemAImpl_xchg_u64_locked
1241%endif
1242
1243; Unlocked variants for fDisregardLock mode.
1244
1245BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
1246 PROLOGUE_2_ARGS
1247 mov T0_8, [A1]
1248 mov T1_8, [A0]
1249 mov [A0], T0_8
1250 mov [A1], T1_8
1251 EPILOGUE_2_ARGS
1252ENDPROC iemAImpl_xchg_u8_unlocked
1253
1254BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
1255 PROLOGUE_2_ARGS
1256 mov T0_16, [A1]
1257 mov T1_16, [A0]
1258 mov [A0], T0_16
1259 mov [A1], T1_16
1260 EPILOGUE_2_ARGS
1261ENDPROC iemAImpl_xchg_u16_unlocked
1262
1263BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
1264 PROLOGUE_2_ARGS
1265 mov T0_32, [A1]
1266 mov T1_32, [A0]
1267 mov [A0], T0_32
1268 mov [A1], T1_32
1269 EPILOGUE_2_ARGS
1270ENDPROC iemAImpl_xchg_u32_unlocked
1271
1272%ifdef RT_ARCH_AMD64
1273BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
1274 PROLOGUE_2_ARGS
1275 mov T0, [A1]
1276 mov T1, [A0]
1277 mov [A0], T0
1278 mov [A1], T1
1279 EPILOGUE_2_ARGS
1280ENDPROC iemAImpl_xchg_u64_unlocked
1281%endif
1282
1283
1284;
1285; XADD for memory operands.
1286;
1287; Each function takes three arguments, first the pointer to the
1288; memory/register, then the pointer to the register, and finally a pointer to
1289; eflags. They all return void.
1290;
1291BEGINCODE
1292BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
1293 PROLOGUE_3_ARGS
1294 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1295 mov T0_8, [A1]
1296 xadd [A0], T0_8
1297 mov [A1], T0_8
1298 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1299 EPILOGUE_3_ARGS
1300ENDPROC iemAImpl_xadd_u8
1301
1302BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
1303 PROLOGUE_3_ARGS
1304 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1305 mov T0_16, [A1]
1306 xadd [A0], T0_16
1307 mov [A1], T0_16
1308 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1309 EPILOGUE_3_ARGS
1310ENDPROC iemAImpl_xadd_u16
1311
1312BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
1313 PROLOGUE_3_ARGS
1314 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1315 mov T0_32, [A1]
1316 xadd [A0], T0_32
1317 mov [A1], T0_32
1318 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1319 EPILOGUE_3_ARGS
1320ENDPROC iemAImpl_xadd_u32
1321
1322%ifdef RT_ARCH_AMD64
1323BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
1324 PROLOGUE_3_ARGS
1325 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1326 mov T0, [A1]
1327 xadd [A0], T0
1328 mov [A1], T0
1329 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1330 EPILOGUE_3_ARGS
1331ENDPROC iemAImpl_xadd_u64
1332%endif ; RT_ARCH_AMD64
1333
1334BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
1335 PROLOGUE_3_ARGS
1336 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1337 mov T0_8, [A1]
1338 lock xadd [A0], T0_8
1339 mov [A1], T0_8
1340 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1341 EPILOGUE_3_ARGS
1342ENDPROC iemAImpl_xadd_u8_locked
1343
1344BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
1345 PROLOGUE_3_ARGS
1346 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1347 mov T0_16, [A1]
1348 lock xadd [A0], T0_16
1349 mov [A1], T0_16
1350 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1351 EPILOGUE_3_ARGS
1352ENDPROC iemAImpl_xadd_u16_locked
1353
1354BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
1355 PROLOGUE_3_ARGS
1356 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1357 mov T0_32, [A1]
1358 lock xadd [A0], T0_32
1359 mov [A1], T0_32
1360 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1361 EPILOGUE_3_ARGS
1362ENDPROC iemAImpl_xadd_u32_locked
1363
1364%ifdef RT_ARCH_AMD64
1365BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
1366 PROLOGUE_3_ARGS
1367 IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1368 mov T0, [A1]
1369 lock xadd [A0], T0
1370 mov [A1], T0
1371 IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1372 EPILOGUE_3_ARGS
1373ENDPROC iemAImpl_xadd_u64_locked
1374%endif ; RT_ARCH_AMD64
1375
1376
1377;
1378; CMPXCHG8B.
1379;
1380; These are tricky register-wise, so the code is duplicated for each calling
1381; convention.
1382;
1383; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1384;
1385; C-proto:
1386; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1387; uint32_t *pEFlags));
1388;
1389; Note! Identical to iemAImpl_cmpxchg16b.
1390;
1391BEGINCODE
1392BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
1393%ifdef RT_ARCH_AMD64
1394 %ifdef ASM_CALL64_MSC
1395 push rbx
1396
1397 mov r11, rdx ; pu64EaxEdx (is also T1)
1398 mov r10, rcx ; pu64Dst
1399
1400 mov ebx, [r8]
1401 mov ecx, [r8 + 4]
1402 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1403 mov eax, [r11]
1404 mov edx, [r11 + 4]
1405
1406 lock cmpxchg8b [r10]
1407
1408 mov [r11], eax
1409 mov [r11 + 4], edx
1410 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1411
1412 pop rbx
1413 ret
1414 %else
1415 push rbx
1416
1417 mov r10, rcx ; pEFlags
1418 mov r11, rdx ; pu64EbxEcx (is also T1)
1419
1420 mov ebx, [r11]
1421 mov ecx, [r11 + 4]
1422 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1423 mov eax, [rsi]
1424 mov edx, [rsi + 4]
1425
1426 lock cmpxchg8b [rdi]
1427
1428 mov [rsi], eax
1429 mov [rsi + 4], edx
1430 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1431
1432 pop rbx
1433 ret
1434
1435 %endif
1436%else
1437 push esi
1438 push edi
1439 push ebx
1440 push ebp
1441
1442 mov edi, ecx ; pu64Dst
1443 mov esi, edx ; pu64EaxEdx
1444 mov ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
1445 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1446
1447 mov ebx, [ecx]
1448 mov ecx, [ecx + 4]
1449 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1450 mov eax, [esi]
1451 mov edx, [esi + 4]
1452
1453 lock cmpxchg8b [edi]
1454
1455 mov [esi], eax
1456 mov [esi + 4], edx
1457 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)
1458
1459 pop ebp
1460 pop ebx
1461 pop edi
1462 pop esi
1463 ret 8
1464%endif
1465ENDPROC iemAImpl_cmpxchg8b
1466
1467BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
1468 ; Lazy bird always lock prefixes cmpxchg8b.
1469 jmp NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
1470ENDPROC iemAImpl_cmpxchg8b_locked
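;
; Semantics sketch (for illustration): CMPXCHG8B compares EDX:EAX with the
; 64-bit memory operand. On a match it sets ZF and writes ECX:EBX to memory;
; on a mismatch it clears ZF and loads the memory value into EDX:EAX. Through
; this helper, *pu64EaxEdx therefore holds the comparand on input and the
; value found in memory on output, which is why EAX/EDX are written back
; unconditionally above.
;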
1471
1472%ifdef RT_ARCH_AMD64
1473
1474;
1475; CMPXCHG16B.
1476;
1477; These are tricky register-wise, so the code is duplicated for each calling
1478; convention.
1479;
1480; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1481;
1482; C-proto:
1483; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1484; uint32_t *pEFlags));
1485;
1486; Note! Identical to iemAImpl_cmpxchg8b.
1487;
1488BEGINCODE
1489BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
1490 %ifdef ASM_CALL64_MSC
1491 push rbx
1492
1493 mov r11, rdx ; pu64RaxRdx (is also T1)
1494 mov r10, rcx ; pu64Dst
1495
1496 mov rbx, [r8]
1497 mov rcx, [r8 + 8]
1498 IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1499 mov rax, [r11]
1500 mov rdx, [r11 + 8]
1501
1502 lock cmpxchg16b [r10]
1503
1504 mov [r11], rax
1505 mov [r11 + 8], rdx
1506 IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1507
1508 pop rbx
1509 ret
1510 %else
1511 push rbx
1512
1513 mov r10, rcx ; pEFlags
1514 mov r11, rdx ; pu64RbxRcx (is also T1)
1515
1516 mov rbx, [r11]
1517 mov rcx, [r11 + 8]
1518 IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
1519 mov rax, [rsi]
1520 mov rdx, [rsi + 8]
1521
1522 lock cmpxchg16b [rdi]
1523
1524 mov [rsi], rax
1525 mov [rsi + 8], rdx
1526 IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)
1527
1528 pop rbx
1529 ret
1530
1531 %endif
1532ENDPROC iemAImpl_cmpxchg16b
1533
1534BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
1535 ; Lazy bird always lock prefixes cmpxchg16b.
1536 jmp NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
1537ENDPROC iemAImpl_cmpxchg16b_locked
1538
1539%endif ; RT_ARCH_AMD64
1540
1541
1542;
1543; CMPXCHG.
1544;
1545; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
1546;
1547; C-proto:
1548; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
1549;
1550BEGINCODE
1551%macro IEMIMPL_CMPXCHG 2
1552BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
1553 PROLOGUE_4_ARGS
1554 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1555 mov al, [A1]
1556 %1 cmpxchg [A0], A2_8
1557 mov [A1], al
1558 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1559 EPILOGUE_4_ARGS
1560ENDPROC iemAImpl_cmpxchg_u8 %+ %2
1561
1562BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
1563 PROLOGUE_4_ARGS
1564 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1565 mov ax, [A1]
1566 %1 cmpxchg [A0], A2_16
1567 mov [A1], ax
1568 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1569 EPILOGUE_4_ARGS
1570ENDPROC iemAImpl_cmpxchg_u16 %+ %2
1571
1572BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
1573 PROLOGUE_4_ARGS
1574 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1575 mov eax, [A1]
1576 %1 cmpxchg [A0], A2_32
1577 mov [A1], eax
1578 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1579 EPILOGUE_4_ARGS
1580ENDPROC iemAImpl_cmpxchg_u32 %+ %2
1581
1582BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
1583%ifdef RT_ARCH_AMD64
1584 PROLOGUE_4_ARGS
1585 IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1586 mov rax, [A1]
1587 %1 cmpxchg [A0], A2
1588 mov [A1], rax
1589 IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
1590 EPILOGUE_4_ARGS
1591%else
1592 ;
1593 ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
1594 ;
1595 push esi
1596 push edi
1597 push ebx
1598 push ebp
1599
1600 mov edi, ecx ; pu64Dst
1601 mov esi, edx ; pu64Rax
1602 mov ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
1603 mov ebp, [esp + 16 + 4 + 4] ; pEFlags
1604
1605 mov ebx, [ecx]
1606 mov ecx, [ecx + 4]
1607 IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
1608 mov eax, [esi]
1609 mov edx, [esi + 4]
1610
1611 lock cmpxchg8b [edi]
1612
1613 ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
1614 jz .cmpxchg8b_not_equal
1615 cmp eax, eax ; just set the other flags.
1616.store:
1617 mov [esi], eax
1618 mov [esi + 4], edx
1619 IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)
1620
1621 pop ebp
1622 pop ebx
1623 pop edi
1624 pop esi
1625 ret 8
1626
1627.cmpxchg8b_not_equal:
1628 cmp [esi + 4], edx ;; @todo FIXME - verify 64-bit compare implementation
1629 jne .store
1630 cmp [esi], eax
1631 jmp .store
1632
1633%endif
1634ENDPROC iemAImpl_cmpxchg_u64 %+ %2
1635%endmacro ; IEMIMPL_CMPXCHG
1636
1637IEMIMPL_CMPXCHG , ,
1638IEMIMPL_CMPXCHG lock, _locked
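;
; Worked example (for illustration): CMPXCHG sets all six status flags from
; the implicit compare of the accumulator with the destination. For the
; 32-bit helper with *puEax=5, a destination value of 5 and uReg=9, the
; exchange succeeds: the destination becomes 9, *puEax stays 5 and ZF=1.
; With a destination value of 7 it fails: *puEax becomes 7 and the flags are
; those of 'cmp 5, 7' (ZF=0, CF=1, SF=1).
;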
1639
1640;;
1641; Macro for implementing a unary operator.
1642;
1643; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1644; variants, except on 32-bit systems where the 64-bit accesses require hand
1645; coding.
1646;
1647; All the functions take a pointer to the destination memory operand in A0
1648; and a pointer to eflags in A1 (the unary operators have no source operand).
1649;
1650; @param 1 The instruction mnemonic.
1651; @param 2 The modified flags.
1652; @param 3 The undefined flags.
1653;
1654%macro IEMIMPL_UNARY_OP 3
1655BEGINCODE
1656BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
1657 PROLOGUE_2_ARGS
1658 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1659 %1 byte [A0]
1660 IEM_SAVE_FLAGS A1, %2, %3
1661 EPILOGUE_2_ARGS
1662ENDPROC iemAImpl_ %+ %1 %+ _u8
1663
1664BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
1665 PROLOGUE_2_ARGS
1666 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1667 lock %1 byte [A0]
1668 IEM_SAVE_FLAGS A1, %2, %3
1669 EPILOGUE_2_ARGS
1670ENDPROC iemAImpl_ %+ %1 %+ _u8_locked
1671
1672BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
1673 PROLOGUE_2_ARGS
1674 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1675 %1 word [A0]
1676 IEM_SAVE_FLAGS A1, %2, %3
1677 EPILOGUE_2_ARGS
1678ENDPROC iemAImpl_ %+ %1 %+ _u16
1679
1680BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
1681 PROLOGUE_2_ARGS
1682 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1683 lock %1 word [A0]
1684 IEM_SAVE_FLAGS A1, %2, %3
1685 EPILOGUE_2_ARGS
1686ENDPROC iemAImpl_ %+ %1 %+ _u16_locked
1687
1688BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
1689 PROLOGUE_2_ARGS
1690 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1691 %1 dword [A0]
1692 IEM_SAVE_FLAGS A1, %2, %3
1693 EPILOGUE_2_ARGS
1694ENDPROC iemAImpl_ %+ %1 %+ _u32
1695
1696BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
1697 PROLOGUE_2_ARGS
1698 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1699 lock %1 dword [A0]
1700 IEM_SAVE_FLAGS A1, %2, %3
1701 EPILOGUE_2_ARGS
1702ENDPROC iemAImpl_ %+ %1 %+ _u32_locked
1703
1704 %ifdef RT_ARCH_AMD64
1705BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
1706 PROLOGUE_2_ARGS
1707 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1708 %1 qword [A0]
1709 IEM_SAVE_FLAGS A1, %2, %3
1710 EPILOGUE_2_ARGS
1711ENDPROC iemAImpl_ %+ %1 %+ _u64
1712
1713BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
1714 PROLOGUE_2_ARGS
1715 IEM_MAYBE_LOAD_FLAGS A1, %2, %3
1716 lock %1 qword [A0]
1717 IEM_SAVE_FLAGS A1, %2, %3
1718 EPILOGUE_2_ARGS
1719ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
1720 %endif ; RT_ARCH_AMD64
1721
1722%endmacro
1723
1724IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1725IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
1726IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
1727IEMIMPL_UNARY_OP not, 0, 0
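;
; Example (for illustration): CF is deliberately absent from the INC/DEC masks
; because those instructions preserve it. Incrementing an 8-bit value of FFh
; with CF=1 yields 00h with ZF=1, AF=1, PF=1 and OF=0, while CF stays 1. NOT
; modifies no flags at all, hence the 0, 0 parameters on its line above.
;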
1728
1729
1730;
1731; BSWAP. No flag changes.
1732;
1733; Each function takes one argument, a pointer to the value to bswap
1734; (input/output). They all return void.
1735;
1736BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
1737 PROLOGUE_1_ARGS
1738 mov T0_32, [A0] ; just in case any of the upper bits are used.
1739 db 66h
1740 bswap T0_32
1741 mov [A0], T0_32
1742 EPILOGUE_1_ARGS
1743ENDPROC iemAImpl_bswap_u16
1744
1745BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
1746 PROLOGUE_1_ARGS
1747 mov T0_32, [A0]
1748 bswap T0_32
1749 mov [A0], T0_32
1750 EPILOGUE_1_ARGS
1751ENDPROC iemAImpl_bswap_u32
1752
1753BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
1754%ifdef RT_ARCH_AMD64
1755 PROLOGUE_1_ARGS
1756 mov T0, [A0]
1757 bswap T0
1758 mov [A0], T0
1759 EPILOGUE_1_ARGS
1760%else
1761 PROLOGUE_1_ARGS
1762 mov T0, [A0]
1763 mov T1, [A0 + 4]
1764 bswap T0
1765 bswap T1
1766 mov [A0 + 4], T0
1767 mov [A0], T1
1768 EPILOGUE_1_ARGS
1769%endif
1770ENDPROC iemAImpl_bswap_u64
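;
; Worked example (for illustration): iemAImpl_bswap_u32 turns 11223344h into
; 44332211h. On a 32-bit host the u64 variant byte-swaps each half and then
; exchanges them, so 1122334455667788h becomes 8877665544332211h, the same
; result a single 64-bit BSWAP gives on AMD64.
;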
1771
1772
1773;;
1774; Macro for implementing a shift operation.
1775;
1776; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1777; 32-bit systems where the 64-bit accesses require hand coding.
1778;
1779; All the functions take a pointer to the destination memory operand in A0,
1780; the shift count in A1 and a pointer to eflags in A2.
1781;
1782; @param 1 The instruction mnemonic.
1783; @param 2 The modified flags.
1784; @param 3 The undefined flags.
1785;
1786; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1787;
1788; @note the _intel and _amd variants are implemented in C.
1789;
1790%macro IEMIMPL_SHIFT_OP 3
1791BEGINCODE
1792BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
1793 PROLOGUE_3_ARGS
1794 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1795 %ifdef ASM_CALL64_GCC
1796 mov cl, A1_8
1797 %1 byte [A0], cl
1798 %else
1799 xchg A1, A0
1800 %1 byte [A1], cl
1801 %endif
1802 IEM_SAVE_FLAGS A2, %2, %3
1803 EPILOGUE_3_ARGS
1804ENDPROC iemAImpl_ %+ %1 %+ _u8
1805
1806BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
1807 PROLOGUE_3_ARGS
1808 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1809 %ifdef ASM_CALL64_GCC
1810 mov cl, A1_8
1811 %1 word [A0], cl
1812 %else
1813 xchg A1, A0
1814 %1 word [A1], cl
1815 %endif
1816 IEM_SAVE_FLAGS A2, %2, %3
1817 EPILOGUE_3_ARGS
1818ENDPROC iemAImpl_ %+ %1 %+ _u16
1819
1820BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1821 PROLOGUE_3_ARGS
1822 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1823 %ifdef ASM_CALL64_GCC
1824 mov cl, A1_8
1825 %1 dword [A0], cl
1826 %else
1827 xchg A1, A0
1828 %1 dword [A1], cl
1829 %endif
1830 IEM_SAVE_FLAGS A2, %2, %3
1831 EPILOGUE_3_ARGS
1832ENDPROC iemAImpl_ %+ %1 %+ _u32
1833
1834 %ifdef RT_ARCH_AMD64
1835BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
1836 PROLOGUE_3_ARGS
1837 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1838 %ifdef ASM_CALL64_GCC
1839 mov cl, A1_8
1840 %1 qword [A0], cl
1841 %else
1842 xchg A1, A0
1843 %1 qword [A1], cl
1844 %endif
1845 IEM_SAVE_FLAGS A2, %2, %3
1846 EPILOGUE_3_ARGS
1847ENDPROC iemAImpl_ %+ %1 %+ _u64
1848 %endif ; RT_ARCH_AMD64
1849
1850%endmacro
1851
1852IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
1853IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
1854IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
1855IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
1856IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1857IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1858IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
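;
; Illustrative note: the variable-count shift/rotate forms only accept CL as
; the count register. With GCC argument ordering the count (A1) is simply
; copied into cl; in the other conventions A0 is already (e/r)cx, so the
; pointer and count are exchanged first, e.g. the non-GCC 32-bit 'shl'
; expansion is roughly
;       xchg    A1, A0              ; count ends up in ecx, pointer in A1
;       shl     dword [A1], cl
;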
1859
1860
1861;;
1862; Macro for implementing a double precision shift operation.
1863;
1864; This will generate code for the 16, 32 and 64 bit accesses, except on
1865; 32-bit systems where the 64-bit accesses require hand coding.
1866;
1867; The functions take the destination operand (r/m) in A0, the source (reg) in
1868; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1869;
1870; @param 1 The instruction mnemonic.
1871; @param 2 The modified flags.
1872; @param 3 The undefined flags.
1873;
1874; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1875;
1876; @note the _intel and _amd variants are implemented in C.
1877;
1878%macro IEMIMPL_SHIFT_DBL_OP 3
1879BEGINCODE
1880BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
1881 PROLOGUE_4_ARGS
1882 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1883 %ifdef ASM_CALL64_GCC
1884 xchg A3, A2
1885 %1 [A0], A1_16, cl
1886 xchg A3, A2
1887 %else
1888 xchg A0, A2
1889 %1 [A2], A1_16, cl
1890 %endif
1891 IEM_SAVE_FLAGS A3, %2, %3
1892 EPILOGUE_4_ARGS
1893ENDPROC iemAImpl_ %+ %1 %+ _u16
1894
1895BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
1896 PROLOGUE_4_ARGS
1897 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1898 %ifdef ASM_CALL64_GCC
1899 xchg A3, A2
1900 %1 [A0], A1_32, cl
1901 xchg A3, A2
1902 %else
1903 xchg A0, A2
1904 %1 [A2], A1_32, cl
1905 %endif
1906 IEM_SAVE_FLAGS A3, %2, %3
1907 EPILOGUE_4_ARGS
1908ENDPROC iemAImpl_ %+ %1 %+ _u32
1909
1910 %ifdef RT_ARCH_AMD64
1911BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
1912 PROLOGUE_4_ARGS
1913 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1914 %ifdef ASM_CALL64_GCC
1915 xchg A3, A2
1916 %1 [A0], A1, cl
1917 xchg A3, A2
1918 %else
1919 xchg A0, A2
1920 %1 [A2], A1, cl
1921 %endif
1922 IEM_SAVE_FLAGS A3, %2, %3
1923 EPILOGUE_4_ARGS_EX 12
1924ENDPROC iemAImpl_ %+ %1 %+ _u64
1925 %endif ; RT_ARCH_AMD64
1926
1927%endmacro
1928
1929IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1930IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1931
1932
1933;;
1934; Macro for implementing multiplication operations.
1935;
1936; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1937; 32-bit systems where the 64-bit accesses require hand coding.
1938;
1939; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1940; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1941; pointer to eflags in A3.
1942;
1943; The functions all return 0 so the same caller code can be used for div/idiv
1944; as well as for the mul/imul implementation.
1945;
1946; @param 1 The instruction mnemonic.
1947; @param 2 The modified flags.
1948; @param 3 The undefined flags.
1949; @param 4 Name suffix.
1950; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1951;
1952; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1953;
1954%macro IEMIMPL_MUL_OP 5
1955BEGINCODE
1956BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
1957 PROLOGUE_3_ARGS
1958 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1959 mov al, [A0]
1960 %1 A1_8
1961 mov [A0], ax
1962 %if %5 != 1
1963 IEM_SAVE_FLAGS A2, %2, %3
1964 %else
1965 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
1966 %endif
1967 xor eax, eax
1968 EPILOGUE_3_ARGS
1969ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4
1970
1971BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
1972 PROLOGUE_4_ARGS
1973 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1974 mov ax, [A0]
1975 %ifdef ASM_CALL64_GCC
1976 %1 A2_16
1977 mov [A0], ax
1978 mov [A1], dx
1979 %else
1980 mov T1, A1
1981 %1 A2_16
1982 mov [A0], ax
1983 mov [T1], dx
1984 %endif
1985 %if %5 != 1
1986 IEM_SAVE_FLAGS A3, %2, %3
1987 %else
1988 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
1989 %endif
1990 xor eax, eax
1991 EPILOGUE_4_ARGS
1992ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4
1993
1994BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
1995 PROLOGUE_4_ARGS
1996 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
1997 mov eax, [A0]
1998 %ifdef ASM_CALL64_GCC
1999 %1 A2_32
2000 mov [A0], eax
2001 mov [A1], edx
2002 %else
2003 mov T1, A1
2004 %1 A2_32
2005 mov [A0], eax
2006 mov [T1], edx
2007 %endif
2008 %if %5 != 1
2009 IEM_SAVE_FLAGS A3, %2, %3
2010 %else
2011 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
2012 %endif
2013 xor eax, eax
2014 EPILOGUE_4_ARGS
2015ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4
2016
2017 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2018BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
2019 PROLOGUE_4_ARGS
2020 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2021 mov rax, [A0]
2022 %ifdef ASM_CALL64_GCC
2023 %1 A2
2024 mov [A0], rax
2025 mov [A1], rdx
2026 %else
2027 mov T1, A1
2028 %1 A2
2029 mov [A0], rax
2030 mov [T1], rdx
2031 %endif
2032 %if %5 != 1
2033 IEM_SAVE_FLAGS A3, %2, %3
2034 %else
2035 IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
2036 %endif
2037 xor eax, eax
2038 EPILOGUE_4_ARGS_EX 12
2039ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
2040 %endif ; RT_ARCH_AMD64
2041
2042%endmacro
2043
2044IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2045IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2046IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2047IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
2048IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
2049IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
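;
; Illustrative sketch of what the '_intel' instantiation above generates for the
; 8-bit case, i.e. 'IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1'
; with the '%if %5 != 1' conditional resolved (the intel variant takes the %else
; branch so SF/PF get calculated explicitly):
;
;       BEGINPROC_FASTCALL iemAImpl_mul_u8_intel, 12
;           PROLOGUE_3_ARGS
;           IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_CF), 0
;           mov     al, [A0]
;           mul     A1_8                        ; AX = AL * A1_8
;           mov     [A0], ax
;           IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, (X86_EFL_OF | X86_EFL_CF), X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
;           xor     eax, eax                    ; return 0 (shared with the div/idiv callers)
;           EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_mul_u8_intel
;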
2050
2051
2052BEGINCODE
2053;;
2054; Worker function for negating the 64-bit value held in the 32-bit register pair T1:T0
2055; @uses None (T0,T1)
2056BEGINPROC iemAImpl_negate_T0_T1_u32
2057 push 0
2058 push 0
2059 xchg T0_32, [xSP]
2060 xchg T1_32, [xSP + xCB]
2061 sub T0_32, [xSP]
2062 sbb T1_32, [xSP + xCB]
2063 add xSP, xCB*2
2064 ret
2065ENDPROC iemAImpl_negate_T0_T1_u32
2066
2067%ifdef RT_ARCH_AMD64
2068;;
2069; Worker function for negating the 128-bit value held in the 64-bit register pair T1:T0
2070; @uses None (T0,T1)
2071BEGINPROC iemAImpl_negate_T0_T1_u64
2072 push 0
2073 push 0
2074 xchg T0, [xSP]
2075 xchg T1, [xSP + xCB]
2076 sub T0, [xSP]
2077 sbb T1, [xSP + xCB]
2078 add xSP, xCB*2
2079 ret
2080ENDPROC iemAImpl_negate_T0_T1_u64
2081%endif
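;
; Note: the workers above push two zero slots, exchange T0/T1 with them (leaving
; zero in the registers and the original value on the stack) and then compute
; 0 - value with borrow propagation.  Worked example for the 32-bit variant,
; using an arbitrarily chosen input value:
;
;       input   T1:T0 = 00000000:00000005h          ; +5 as a 64-bit quantity
;       sub     T0_32, [xSP]                        ; 0 - 5      -> T0 = FFFFFFFBh, CF=1
;       sbb     T1_32, [xSP + xCB]                  ; 0 - 0 - CF -> T1 = FFFFFFFFh
;       output  T1:T0 = FFFFFFFF:FFFFFFFBh          ; -5
;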
2082
2083
2084;;
2085; Macro for implementing division operations.
2086;
2087; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2088; 32-bit systems where the 64-bit accesses require hand coding.
2089;
2090; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2091; functions take a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2092; pointer to eflags in A3.
2093;
2094; The functions all return 0 on success and -1 if a divide error should be
2095; raised by the caller.
2096;
2097; @param 1 The instruction mnemonic.
2098; @param 2 The modified flags.
2099; @param 3 The undefined flags.
2100; @param 4 1 if signed, 0 if unsigned.
2101; @param 5 Function suffix.
2102; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2103; 2 for AMD (set AF, clear PF, ZF and SF).
2104;
2105; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2106;
2107%macro IEMIMPL_DIV_OP 6
2108BEGINCODE
2109BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2110 PROLOGUE_3_ARGS
2111
2112 ; div by chainsaw check.
2113 test A1_8, A1_8
2114 jz .div_zero
2115
2116 ; Overflow check - unsigned division is simple to verify, but we haven't
2117 ; found a simple way to check signed division yet, unfortunately.
2118 %if %4 == 0
2119 cmp [A0 + 1], A1_8
2120 jae .div_overflow
2121 %else
2122 mov T0_16, [A0] ; T0 = dividend
2123 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2124 test A1_8, A1_8
2125 js .divisor_negative
2126 test T0_16, T0_16
2127 jns .both_positive
2128 neg T0_16
2129.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2130 push T0 ; Start off like unsigned below.
2131 shr T0_16, 7
2132 cmp T0_8, A1_8
2133 pop T0
2134 jb .div_no_overflow
2135 ja .div_overflow
2136 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2137 cmp T0_8, A1_8
2138 jae .div_overflow
2139 jmp .div_no_overflow
2140
2141.divisor_negative:
2142 neg A1_8
2143 test T0_16, T0_16
2144 jns .one_of_each
2145 neg T0_16
2146.both_positive: ; Same as unsigned shifted by sign indicator bit.
2147 shr T0_16, 7
2148 cmp T0_8, A1_8
2149 jae .div_overflow
2150.div_no_overflow:
2151 mov A1, T1 ; restore divisor
2152 %endif
2153
2154 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2155 mov ax, [A0]
2156 %1 A1_8
2157 mov [A0], ax
2158 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2159 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2160 %else
2161 IEM_SAVE_FLAGS A2, %2, %3
2162 %endif
2163 xor eax, eax
2164
2165.return:
2166 EPILOGUE_3_ARGS
2167
2168.div_zero:
2169.div_overflow:
2170 mov eax, -1
2171 jmp .return
2172ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2173
2174BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
2175 PROLOGUE_4_ARGS
2176
2177 ; div by chainsaw check.
2178 test A2_16, A2_16
2179 jz .div_zero
2180
2181 ; Overflow check - unsigned division is simple to verify, but we haven't
2182 ; found a simple way to check signed division yet, unfortunately.
2183 %if %4 == 0
2184 cmp [A1], A2_16
2185 jae .div_overflow
2186 %else
2187 mov T0_16, [A1]
2188 shl T0_32, 16
2189 mov T0_16, [A0] ; T0 = dividend
2190 mov T1, A2 ; T1 = divisor
2191 test T1_16, T1_16
2192 js .divisor_negative
2193 test T0_32, T0_32
2194 jns .both_positive
2195 neg T0_32
2196.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2197 push T0 ; Start off like unsigned below.
2198 shr T0_32, 15
2199 cmp T0_16, T1_16
2200 pop T0
2201 jb .div_no_overflow
2202 ja .div_overflow
2203 and T0_16, 0x7fff ; Special case for covering (divisor - 1).
2204 cmp T0_16, T1_16
2205 jae .div_overflow
2206 jmp .div_no_overflow
2207
2208.divisor_negative:
2209 neg T1_16
2210 test T0_32, T0_32
2211 jns .one_of_each
2212 neg T0_32
2213.both_positive: ; Same as unsigned shifted by sign indicator bit.
2214 shr T0_32, 15
2215 cmp T0_16, T1_16
2216 jae .div_overflow
2217.div_no_overflow:
2218 %endif
2219
2220 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2221 %ifdef ASM_CALL64_GCC
2222 mov T1, A2
2223 mov ax, [A0]
2224 mov dx, [A1]
2225 %1 T1_16
2226 mov [A0], ax
2227 mov [A1], dx
2228 %else
2229 mov T1, A1
2230 mov ax, [A0]
2231 mov dx, [T1]
2232 %1 A2_16
2233 mov [A0], ax
2234 mov [T1], dx
2235 %endif
2236 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2237 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2238 %else
2239 IEM_SAVE_FLAGS A3, %2, %3
2240 %endif
2241 xor eax, eax
2242
2243.return:
2244 EPILOGUE_4_ARGS
2245
2246.div_zero:
2247.div_overflow:
2248 mov eax, -1
2249 jmp .return
2250ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2251
2252BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
2253 PROLOGUE_4_ARGS
2254
2255 ; div by chainsaw check.
2256 test A2_32, A2_32
2257 jz .div_zero
2258
2259 ; Overflow check - unsigned division is simple to verify, but we haven't
2260 ; found a simple way to check signed division yet, unfortunately.
2261 %if %4 == 0
2262 cmp [A1], A2_32
2263 jae .div_overflow
2264 %else
2265 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2266 mov T0_32, [A0] ; T0 = dividend low
2267 mov T1_32, [A1] ; T1 = dividend high
2268 test A2_32, A2_32
2269 js .divisor_negative
2270 test T1_32, T1_32
2271 jns .both_positive
2272 call NAME(iemAImpl_negate_T0_T1_u32)
2273.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2274 push T0 ; Start off like unsigned below.
2275 shl T1_32, 1
2276 shr T0_32, 31
2277 or T1_32, T0_32
2278 cmp T1_32, A2_32
2279 pop T0
2280 jb .div_no_overflow
2281 ja .div_overflow
2282 and T0_32, 0x7fffffff ; Special case for covering (divisor - 1).
2283 cmp T0_32, A2_32
2284 jae .div_overflow
2285 jmp .div_no_overflow
2286
2287.divisor_negative:
2288 neg A2_32
2289 test T1_32, T1_32
2290 jns .one_of_each
2291 call NAME(iemAImpl_negate_T0_T1_u32)
2292.both_positive: ; Same as unsigned shifted by sign indicator bit.
2293 shl T1_32, 1
2294 shr T0_32, 31
2295 or T1_32, T0_32
2296 cmp T1_32, A2_32
2297 jae .div_overflow
2298.div_no_overflow:
2299 pop A2
2300 %endif
2301
2302 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2303 mov eax, [A0]
2304 %ifdef ASM_CALL64_GCC
2305 mov T1, A2
2306 mov eax, [A0]
2307 mov edx, [A1]
2308 %1 T1_32
2309 mov [A0], eax
2310 mov [A1], edx
2311 %else
2312 mov T1, A1
2313 mov eax, [A0]
2314 mov edx, [T1]
2315 %1 A2_32
2316 mov [A0], eax
2317 mov [T1], edx
2318 %endif
2319 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2320 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2321 %else
2322 IEM_SAVE_FLAGS A3, %2, %3
2323 %endif
2324 xor eax, eax
2325
2326.return:
2327 EPILOGUE_4_ARGS
2328
2329.div_overflow:
2330 %if %4 != 0
2331 pop A2
2332 %endif
2333.div_zero:
2334 mov eax, -1
2335 jmp .return
2336ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2337
2338 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2339BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2340 PROLOGUE_4_ARGS
2341
2342 test A2, A2
2343 jz .div_zero
2344 %if %4 == 0
2345 cmp [A1], A2
2346 jae .div_overflow
2347 %else
2348 push A2 ; save A2 so we can modify it (we're out of regs on x86).
2349 mov T0, [A0] ; T0 = dividend low
2350 mov T1, [A1] ; T1 = dividend high
2351 test A2, A2
2352 js .divisor_negative
2353 test T1, T1
2354 jns .both_positive
2355 call NAME(iemAImpl_negate_T0_T1_u64)
2356.one_of_each: ; OK range is 2^(result-width - 1) * divisor + (divisor - 1).
2357 push T0 ; Start off like unsigned below.
2358 shl T1, 1
2359 shr T0, 63
2360 or T1, T0
2361 cmp T1, A2
2362 pop T0
2363 jb .div_no_overflow
2364 ja .div_overflow
2365 mov T1, 0x7fffffffffffffff
2366 and T0, T1 ; Special case for covering (divisor - 1).
2367 cmp T0, A2
2368 jae .div_overflow
2369 jmp .div_no_overflow
2370
2371.divisor_negative:
2372 neg A2
2373 test T1, T1
2374 jns .one_of_each
2375 call NAME(iemAImpl_negate_T0_T1_u64)
2376.both_positive: ; Same as unsigned shifted by sign indicator bit.
2377 shl T1, 1
2378 shr T0, 63
2379 or T1, T0
2380 cmp T1, A2
2381 jae .div_overflow
2382.div_no_overflow:
2383 pop A2
2384 %endif
2385
2386 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2387 mov rax, [A0]
2388 %ifdef ASM_CALL64_GCC
2389 mov T1, A2
2390 mov rax, [A0]
2391 mov rdx, [A1]
2392 %1 T1
2393 mov [A0], rax
2394 mov [A1], rdx
2395 %else
2396 mov T1, A1
2397 mov rax, [A0]
2398 mov rdx, [T1]
2399 %1 A2
2400 mov [A0], rax
2401 mov [T1], rdx
2402 %endif
2403 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2404 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2405 %else
2406 IEM_SAVE_FLAGS A3, %2, %3
2407 %endif
2408 xor eax, eax
2409
2410.return:
2411 EPILOGUE_4_ARGS_EX 12
2412
2413.div_overflow:
2414 %if %4 != 0
2415 pop A2
2416 %endif
2417.div_zero:
2418 mov eax, -1
2419 jmp .return
2420ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2421 %endif ; RT_ARCH_AMD64
2422
2423%endmacro
2424
2425IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
2426IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
2427IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
2428IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
2429IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
2430IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
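;
; Worked example of the signed overflow pre-check in the macro above for the
; 8-bit case (result width 8, so the quotient must fit into [-128..127]); the
; numbers are picked purely for illustration:
;
;       dividend = 258, divisor = -2   ->  quotient would be -129       => overflow
;           |dividend| >> 7   = 2, equal to |divisor| (2)  -> check the low bits
;           |dividend| & 7fh  = 2, not below |divisor| (2) -> .div_overflow
;
;       dividend = 257, divisor = -2   ->  quotient -128, remainder 1   => OK
;           |dividend| >> 7   = 2, equal to |divisor| (2)  -> check the low bits
;           |dividend| & 7fh  = 1, below |divisor| (2)     -> .div_no_overflow
;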
2431
2432
2433;;
2434; Macro for implementing memory fence operation.
2435;
2436; No return value, no operands or anything.
2437;
2438; @param 1 The instruction.
2439;
2440%macro IEMIMPL_MEM_FENCE 1
2441BEGINCODE
2442BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
2443 %1
2444 ret
2445ENDPROC iemAImpl_ %+ %1
2446%endmacro
2447
2448IEMIMPL_MEM_FENCE lfence
2449IEMIMPL_MEM_FENCE sfence
2450IEMIMPL_MEM_FENCE mfence
2451
2452;;
2453; Alternative for non-SSE2 host.
2454;
2455BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
2456 push xAX
2457 xchg xAX, [xSP]
2458 add xSP, xCB
2459 ret
2460ENDPROC iemAImpl_alt_mem_fence
2461
2462
2463;;
2464; Initialize the FPU for the actual instruction being emulated; this means
2465; loading parts of the guest's control word and status word.
2466;
2467; @uses 24 bytes of stack. T0, T1
2468; @param 1 Expression giving the address of the FXSTATE of the guest.
2469;
2470%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
2471 fnstenv [xSP]
2472
2473 ; FCW - for exception, precision and rounding control.
2474 movzx T0, word [%1 + X86FXSTATE.FCW]
2475 and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2476 mov [xSP + X86FSTENV32P.FCW], T0_16
2477
2478 ; FSW - for undefined C0, C1, C2, and C3.
2479 movzx T1, word [%1 + X86FXSTATE.FSW]
2480 and T1, X86_FSW_C_MASK
2481 movzx T0, word [xSP + X86FSTENV32P.FSW]
2482 and T0, X86_FSW_TOP_MASK
2483 or T0, T1
2484 mov [xSP + X86FSTENV32P.FSW], T0_16
2485
2486 fldenv [xSP]
2487%endmacro
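;
; Worked example of the FSW merge above (x87 FSW layout: C0=bit 8, C1=bit 9,
; C2=bit 10, TOP=bits 11..13, C3=bit 14; the values are illustrative only):
;
;       guest FSW        = 4500h  ->  & X86_FSW_C_MASK   -> 4500h (keep guest C3+C0)
;       host fnstenv FSW = 3800h  ->  & X86_FSW_TOP_MASK -> 3800h (keep host TOP=7)
;       merged FSW       = 7D00h  ->  stored back into the env and reloaded via fldenv
;
; I.e. the guest's condition code bits become visible to the emulated instruction
; while the host's current TOP is left alone so the register stack isn't shifted.
;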
2488
2489
2490;;
2491; Initialize the FPU for the actual instruction being emulated; this means
2492; loading parts of the guest's control word and status word, and updating the
2493; tag word for the top register if it's empty.
2494;
2495; ASSUMES actual TOP=7
2496;
2497; @uses 24 bytes of stack. T0, T1
2498; @param 1 Expression giving the address of the FXSTATE of the guest.
2499;
2500%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
2501 fnstenv [xSP]
2502
2503 ; FCW - for exception, precision and rounding control.
2504 movzx T0_32, word [%1 + X86FXSTATE.FCW]
2505 and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
2506 mov [xSP + X86FSTENV32P.FCW], T0_16
2507
2508 ; FSW - for undefined C0, C1, C2, and C3.
2509 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2510 and T1_32, X86_FSW_C_MASK
2511 movzx T0_32, word [xSP + X86FSTENV32P.FSW]
2512 and T0_32, X86_FSW_TOP_MASK
2513 or T0_32, T1_32
2514 mov [xSP + X86FSTENV32P.FSW], T0_16
2515
2516 ; FTW - Only for ST0 (in/out).
2517 movzx T1_32, word [%1 + X86FXSTATE.FSW]
2518 shr T1_32, X86_FSW_TOP_SHIFT
2519 and T1_32, X86_FSW_TOP_SMASK
2520 bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
2521 jc %%st0_not_empty
2522 or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
2523%%st0_not_empty:
2524
2525 fldenv [xSP]
2526%endmacro
2527
2528
2529;;
2530; Need to move this somewhere better as well?
2531;
2532struc IEMFPURESULT
2533 .r80Result resw 5
2534 .FSW resw 1
2535endstruc
2536
2537
2538;;
2539; Need to move this somewhere better as well?
2540;
2541struc IEMFPURESULTTWO
2542 .r80Result1 resw 5
2543 .FSW resw 1
2544 .r80Result2 resw 5
2545endstruc
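;
; Resulting field layout of the two structures above ('resw 5' reserves the ten
; bytes of an 80-bit extended real); this is meant to mirror the C declarations
; used by the callers:
;
;       IEMFPURESULT:     .r80Result  at offset  0, 10 bytes
;                         .FSW        at offset 10,  2 bytes
;       IEMFPURESULTTWO:  .r80Result1 at offset  0, 10 bytes
;                         .FSW        at offset 10,  2 bytes
;                         .r80Result2 at offset 12, 10 bytes
;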
2546
2547
2548;
2549;---------------------- 16-bit signed integer operations ----------------------
2550;
2551
2552
2553;;
2554; Converts a 16-bit signed integer value to an 80-bit floating point value (fpu register).
2555;
2556; @param A0 FPU context (fxsave).
2557; @param A1 Pointer to a IEMFPURESULT for the output.
2558; @param A2 Pointer to the 16-bit signed integer value to convert.
2559;
2560BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
2561 PROLOGUE_3_ARGS
2562 sub xSP, 20h
2563
2564 fninit
2565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2566 fild word [A2]
2567
2568 fnstsw word [A1 + IEMFPURESULT.FSW]
2569 fnclex
2570 fstp tword [A1 + IEMFPURESULT.r80Result]
2571
2572 fninit
2573 add xSP, 20h
2574 EPILOGUE_3_ARGS
2575ENDPROC iemAImpl_fild_r80_from_i16
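;
; All of the FPU helpers below follow the same pattern as the function above:
; fninit for a clean host x87 state, load the input operand(s), pull in the
; guest FCW and a sanitized FSW via FPU_LD_FXSTATE_FCW_AND_SAFE_FSW, execute the
; instruction, capture the resulting FSW with fnstsw (clearing pending exceptions
; with fnclex before storing a result), and finish with another fninit so no
; state leaks back to the host.
;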
2576
2577
2578;;
2579; Store an 80-bit floating point value (register) as a 16-bit signed integer (memory).
2580;
2581; @param A0 FPU context (fxsave).
2582; @param A1 Where to return the output FSW.
2583; @param A2 Where to store the 16-bit signed integer value.
2584; @param A3 Pointer to the 80-bit value.
2585;
2586BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
2587 PROLOGUE_4_ARGS
2588 sub xSP, 20h
2589
2590 fninit
2591 fld tword [A3]
2592 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2593 fistp word [A2]
2594
2595 fnstsw word [A1]
2596
2597 fninit
2598 add xSP, 20h
2599 EPILOGUE_4_ARGS
2600ENDPROC iemAImpl_fist_r80_to_i16
2601
2602
2603;;
2604; Store an 80-bit floating point value (register) as a 16-bit signed integer
2605; (memory) with truncation.
2606;
2607; @param A0 FPU context (fxsave).
2608; @param A1 Where to return the output FSW.
2609; @param A2 Where to store the 16-bit signed integer value.
2610; @param A3 Pointer to the 80-bit value.
2611;
2612BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
2613 PROLOGUE_4_ARGS
2614 sub xSP, 20h
2615
2616 fninit
2617 fld tword [A3]
2618 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2619 fisttp word [A2]
2620
2621 fnstsw word [A1]
2622
2623 fninit
2624 add xSP, 20h
2625 EPILOGUE_4_ARGS
2626ENDPROC iemAImpl_fistt_r80_to_i16
2627
2628
2629;;
2630; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2631;
2632; @param 1 The instruction
2633;
2634; @param A0 FPU context (fxsave).
2635; @param A1 Pointer to a IEMFPURESULT for the output.
2636; @param A2 Pointer to the 80-bit value.
2637; @param A3 Pointer to the 16-bit value.
2638;
2639%macro IEMIMPL_FPU_R80_BY_I16 1
2640BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2641 PROLOGUE_4_ARGS
2642 sub xSP, 20h
2643
2644 fninit
2645 fld tword [A2]
2646 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2647 %1 word [A3]
2648
2649 fnstsw word [A1 + IEMFPURESULT.FSW]
2650 fnclex
2651 fstp tword [A1 + IEMFPURESULT.r80Result]
2652
2653 fninit
2654 add xSP, 20h
2655 EPILOGUE_4_ARGS
2656ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2657%endmacro
2658
2659IEMIMPL_FPU_R80_BY_I16 fiadd
2660IEMIMPL_FPU_R80_BY_I16 fimul
2661IEMIMPL_FPU_R80_BY_I16 fisub
2662IEMIMPL_FPU_R80_BY_I16 fisubr
2663IEMIMPL_FPU_R80_BY_I16 fidiv
2664IEMIMPL_FPU_R80_BY_I16 fidivr
2665
2666
2667;;
2668; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2669; only returning FSW.
2670;
2671; @param 1 The instruction
2672;
2673; @param A0 FPU context (fxsave).
2674; @param A1 Where to store the output FSW.
2675; @param A2 Pointer to the 80-bit value.
2676; @param A3 Pointer to the 16-bit value.
2677;
2678%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
2679BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
2680 PROLOGUE_4_ARGS
2681 sub xSP, 20h
2682
2683 fninit
2684 fld tword [A2]
2685 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2686 %1 word [A3]
2687
2688 fnstsw word [A1]
2689
2690 fninit
2691 add xSP, 20h
2692 EPILOGUE_4_ARGS
2693ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
2694%endmacro
2695
2696IEMIMPL_FPU_R80_BY_I16_FSW ficom
2697
2698
2699
2700;
2701;---------------------- 32-bit signed integer operations ----------------------
2702;
2703
2704
2705;;
2706; Converts a 32-bit signed integer value to an 80-bit floating point value (fpu register).
2707;
2708; @param A0 FPU context (fxsave).
2709; @param A1 Pointer to a IEMFPURESULT for the output.
2710; @param A2 Pointer to the 32-bit signed integer value to convert.
2711;
2712BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
2713 PROLOGUE_3_ARGS
2714 sub xSP, 20h
2715
2716 fninit
2717 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2718 fild dword [A2]
2719
2720 fnstsw word [A1 + IEMFPURESULT.FSW]
2721 fnclex
2722 fstp tword [A1 + IEMFPURESULT.r80Result]
2723
2724 fninit
2725 add xSP, 20h
2726 EPILOGUE_3_ARGS
2727ENDPROC iemAImpl_fild_r80_from_i32
2728
2729
2730;;
2731; Store an 80-bit floating point value (register) as a 32-bit signed integer (memory).
2732;
2733; @param A0 FPU context (fxsave).
2734; @param A1 Where to return the output FSW.
2735; @param A2 Where to store the 32-bit signed integer value.
2736; @param A3 Pointer to the 80-bit value.
2737;
2738BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
2739 PROLOGUE_4_ARGS
2740 sub xSP, 20h
2741
2742 fninit
2743 fld tword [A3]
2744 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2745 fistp dword [A2]
2746
2747 fnstsw word [A1]
2748
2749 fninit
2750 add xSP, 20h
2751 EPILOGUE_4_ARGS
2752ENDPROC iemAImpl_fist_r80_to_i32
2753
2754
2755;;
2756; Store an 80-bit floating point value (register) as a 32-bit signed integer
2757; (memory) with truncation.
2758;
2759; @param A0 FPU context (fxsave).
2760; @param A1 Where to return the output FSW.
2761; @param A2 Where to store the 32-bit signed integer value.
2762; @param A3 Pointer to the 80-bit value.
2763;
2764BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
2765 PROLOGUE_4_ARGS
2766 sub xSP, 20h
2767
2768 fninit
2769 fld tword [A3]
2770 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2771 fisttp dword [A2]
2772
2773 fnstsw word [A1]
2774
2775 fninit
2776 add xSP, 20h
2777 EPILOGUE_4_ARGS
2778ENDPROC iemAImpl_fistt_r80_to_i32
2779
2780
2781;;
2782; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2783;
2784; @param 1 The instruction
2785;
2786; @param A0 FPU context (fxsave).
2787; @param A1 Pointer to a IEMFPURESULT for the output.
2788; @param A2 Pointer to the 80-bit value.
2789; @param A3 Pointer to the 32-bit value.
2790;
2791%macro IEMIMPL_FPU_R80_BY_I32 1
2792BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2793 PROLOGUE_4_ARGS
2794 sub xSP, 20h
2795
2796 fninit
2797 fld tword [A2]
2798 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2799 %1 dword [A3]
2800
2801 fnstsw word [A1 + IEMFPURESULT.FSW]
2802 fnclex
2803 fstp tword [A1 + IEMFPURESULT.r80Result]
2804
2805 fninit
2806 add xSP, 20h
2807 EPILOGUE_4_ARGS
2808ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2809%endmacro
2810
2811IEMIMPL_FPU_R80_BY_I32 fiadd
2812IEMIMPL_FPU_R80_BY_I32 fimul
2813IEMIMPL_FPU_R80_BY_I32 fisub
2814IEMIMPL_FPU_R80_BY_I32 fisubr
2815IEMIMPL_FPU_R80_BY_I32 fidiv
2816IEMIMPL_FPU_R80_BY_I32 fidivr
2817
2818
2819;;
2820; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2821; only returning FSW.
2822;
2823; @param 1 The instruction
2824;
2825; @param A0 FPU context (fxsave).
2826; @param A1 Where to store the output FSW.
2827; @param A2 Pointer to the 80-bit value.
2828; @param A3 Pointer to the 32-bit value.
2829;
2830%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
2831BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
2832 PROLOGUE_4_ARGS
2833 sub xSP, 20h
2834
2835 fninit
2836 fld tword [A2]
2837 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2838 %1 dword [A3]
2839
2840 fnstsw word [A1]
2841
2842 fninit
2843 add xSP, 20h
2844 EPILOGUE_4_ARGS
2845ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
2846%endmacro
2847
2848IEMIMPL_FPU_R80_BY_I32_FSW ficom
2849
2850
2851
2852;
2853;---------------------- 64-bit signed integer operations ----------------------
2854;
2855
2856
2857;;
2858; Converts a 64-bit signed integer value to an 80-bit floating point value (fpu register).
2859;
2860; @param A0 FPU context (fxsave).
2861; @param A1 Pointer to a IEMFPURESULT for the output.
2862; @param A2 Pointer to the 64-bit signed integer value to convert.
2863;
2864BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
2865 PROLOGUE_3_ARGS
2866 sub xSP, 20h
2867
2868 fninit
2869 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2870 fild qword [A2]
2871
2872 fnstsw word [A1 + IEMFPURESULT.FSW]
2873 fnclex
2874 fstp tword [A1 + IEMFPURESULT.r80Result]
2875
2876 fninit
2877 add xSP, 20h
2878 EPILOGUE_3_ARGS
2879ENDPROC iemAImpl_fild_r80_from_i64
2880
2881
2882;;
2883; Store an 80-bit floating point value (register) as a 64-bit signed integer (memory).
2884;
2885; @param A0 FPU context (fxsave).
2886; @param A1 Where to return the output FSW.
2887; @param A2 Where to store the 64-bit signed integer value.
2888; @param A3 Pointer to the 80-bit value.
2889;
2890BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
2891 PROLOGUE_4_ARGS
2892 sub xSP, 20h
2893
2894 fninit
2895 fld tword [A3]
2896 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2897 fistp qword [A2]
2898
2899 fnstsw word [A1]
2900
2901 fninit
2902 add xSP, 20h
2903 EPILOGUE_4_ARGS
2904ENDPROC iemAImpl_fist_r80_to_i64
2905
2906
2907;;
2908; Store an 80-bit floating point value (register) as a 64-bit signed integer
2909; (memory) with truncation.
2910;
2911; @param A0 FPU context (fxsave).
2912; @param A1 Where to return the output FSW.
2913; @param A2 Where to store the 64-bit signed integer value.
2914; @param A3 Pointer to the 80-bit value.
2915;
2916BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
2917 PROLOGUE_4_ARGS
2918 sub xSP, 20h
2919
2920 fninit
2921 fld tword [A3]
2922 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2923 fisttp qword [A2]
2924
2925 fnstsw word [A1]
2926
2927 fninit
2928 add xSP, 20h
2929 EPILOGUE_4_ARGS
2930ENDPROC iemAImpl_fistt_r80_to_i64
2931
2932
2933
2934;
2935;---------------------- 32-bit floating point operations ----------------------
2936;
2937
2938;;
2939; Converts a 32-bit floating point value to an 80-bit one (fpu register).
2940;
2941; @param A0 FPU context (fxsave).
2942; @param A1 Pointer to a IEMFPURESULT for the output.
2943; @param A2 Pointer to the 32-bit floating point value to convert.
2944;
2945BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
2946 PROLOGUE_3_ARGS
2947 sub xSP, 20h
2948
2949 fninit
2950 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2951 fld dword [A2]
2952
2953 fnstsw word [A1 + IEMFPURESULT.FSW]
2954 fnclex
2955 fstp tword [A1 + IEMFPURESULT.r80Result]
2956
2957 fninit
2958 add xSP, 20h
2959 EPILOGUE_3_ARGS
2960ENDPROC iemAImpl_fld_r80_from_r32
2961
2962
2963;;
2964; Store an 80-bit floating point value (register) as a 32-bit one (memory).
2965;
2966; @param A0 FPU context (fxsave).
2967; @param A1 Where to return the output FSW.
2968; @param A2 Where to store the 32-bit value.
2969; @param A3 Pointer to the 80-bit value.
2970;
2971BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
2972 PROLOGUE_4_ARGS
2973 sub xSP, 20h
2974
2975 fninit
2976 fld tword [A3]
2977 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
2978 fst dword [A2]
2979
2980 fnstsw word [A1]
2981
2982 fninit
2983 add xSP, 20h
2984 EPILOGUE_4_ARGS
2985ENDPROC iemAImpl_fst_r80_to_r32
2986
2987
2988;;
2989; FPU instruction working on one 80-bit and one 32-bit floating point value.
2990;
2991; @param 1 The instruction
2992;
2993; @param A0 FPU context (fxsave).
2994; @param A1 Pointer to a IEMFPURESULT for the output.
2995; @param A2 Pointer to the 80-bit value.
2996; @param A3 Pointer to the 32-bit value.
2997;
2998%macro IEMIMPL_FPU_R80_BY_R32 1
2999BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3000 PROLOGUE_4_ARGS
3001 sub xSP, 20h
3002
3003 fninit
3004 fld tword [A2]
3005 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3006 %1 dword [A3]
3007
3008 fnstsw word [A1 + IEMFPURESULT.FSW]
3009 fnclex
3010 fstp tword [A1 + IEMFPURESULT.r80Result]
3011
3012 fninit
3013 add xSP, 20h
3014 EPILOGUE_4_ARGS
3015ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3016%endmacro
3017
3018IEMIMPL_FPU_R80_BY_R32 fadd
3019IEMIMPL_FPU_R80_BY_R32 fmul
3020IEMIMPL_FPU_R80_BY_R32 fsub
3021IEMIMPL_FPU_R80_BY_R32 fsubr
3022IEMIMPL_FPU_R80_BY_R32 fdiv
3023IEMIMPL_FPU_R80_BY_R32 fdivr
3024
3025
3026;;
3027; FPU instruction working on one 80-bit and one 32-bit floating point value,
3028; only returning FSW.
3029;
3030; @param 1 The instruction
3031;
3032; @param A0 FPU context (fxsave).
3033; @param A1 Where to store the output FSW.
3034; @param A2 Pointer to the 80-bit value.
3035; @param A3 Pointer to the 32-bit value.
3036;
3037%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
3038BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
3039 PROLOGUE_4_ARGS
3040 sub xSP, 20h
3041
3042 fninit
3043 fld tword [A2]
3044 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3045 %1 dword [A3]
3046
3047 fnstsw word [A1]
3048
3049 fninit
3050 add xSP, 20h
3051 EPILOGUE_4_ARGS
3052ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
3053%endmacro
3054
3055IEMIMPL_FPU_R80_BY_R32_FSW fcom
3056
3057
3058
3059;
3060;---------------------- 64-bit floating point operations ----------------------
3061;
3062
3063;;
3064; Converts a 64-bit floating point value to an 80-bit one (fpu register).
3065;
3066; @param A0 FPU context (fxsave).
3067; @param A1 Pointer to a IEMFPURESULT for the output.
3068; @param A2 Pointer to the 64-bit floating point value to convert.
3069;
3070BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
3071 PROLOGUE_3_ARGS
3072 sub xSP, 20h
3073
3074 fninit
3075 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3076 fld qword [A2]
3077
3078 fnstsw word [A1 + IEMFPURESULT.FSW]
3079 fnclex
3080 fstp tword [A1 + IEMFPURESULT.r80Result]
3081
3082 fninit
3083 add xSP, 20h
3084 EPILOGUE_3_ARGS
3085ENDPROC iemAImpl_fld_r80_from_r64
3086
3087
3088;;
3089; Store an 80-bit floating point value (register) as a 64-bit one (memory).
3090;
3091; @param A0 FPU context (fxsave).
3092; @param A1 Where to return the output FSW.
3093; @param A2 Where to store the 64-bit value.
3094; @param A3 Pointer to the 80-bit value.
3095;
3096BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
3097 PROLOGUE_4_ARGS
3098 sub xSP, 20h
3099
3100 fninit
3101 fld tword [A3]
3102 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3103 fst qword [A2]
3104
3105 fnstsw word [A1]
3106
3107 fninit
3108 add xSP, 20h
3109 EPILOGUE_4_ARGS
3110ENDPROC iemAImpl_fst_r80_to_r64
3111
3112
3113;;
3114; FPU instruction working on one 80-bit and one 64-bit floating point value.
3115;
3116; @param 1 The instruction
3117;
3118; @param A0 FPU context (fxsave).
3119; @param A1 Pointer to a IEMFPURESULT for the output.
3120; @param A2 Pointer to the 80-bit value.
3121; @param A3 Pointer to the 64-bit value.
3122;
3123%macro IEMIMPL_FPU_R80_BY_R64 1
3124BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3125 PROLOGUE_4_ARGS
3126 sub xSP, 20h
3127
3128 fninit
3129 fld tword [A2]
3130 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3131 %1 qword [A3]
3132
3133 fnstsw word [A1 + IEMFPURESULT.FSW]
3134 fnclex
3135 fstp tword [A1 + IEMFPURESULT.r80Result]
3136
3137 fninit
3138 add xSP, 20h
3139 EPILOGUE_4_ARGS
3140ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3141%endmacro
3142
3143IEMIMPL_FPU_R80_BY_R64 fadd
3144IEMIMPL_FPU_R80_BY_R64 fmul
3145IEMIMPL_FPU_R80_BY_R64 fsub
3146IEMIMPL_FPU_R80_BY_R64 fsubr
3147IEMIMPL_FPU_R80_BY_R64 fdiv
3148IEMIMPL_FPU_R80_BY_R64 fdivr
3149
3150;;
3151; FPU instruction working on one 80-bit and one 64-bit floating point value,
3152; only returning FSW.
3153;
3154; @param 1 The instruction
3155;
3156; @param A0 FPU context (fxsave).
3157; @param A1 Where to store the output FSW.
3158; @param A2 Pointer to the 80-bit value.
3159; @param A3 Pointer to the 64-bit value.
3160;
3161%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
3162BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
3163 PROLOGUE_4_ARGS
3164 sub xSP, 20h
3165
3166 fninit
3167 fld tword [A2]
3168 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3169 %1 qword [A3]
3170
3171 fnstsw word [A1]
3172
3173 fninit
3174 add xSP, 20h
3175 EPILOGUE_4_ARGS
3176ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
3177%endmacro
3178
3179IEMIMPL_FPU_R80_BY_R64_FSW fcom
3180
3181
3182
3183;
3184;---------------------- 80-bit floating point operations ----------------------
3185;
3186
3187;;
3188; Loads an 80-bit floating point register value from memory.
3189;
3190; @param A0 FPU context (fxsave).
3191; @param A1 Pointer to a IEMFPURESULT for the output.
3192; @param A2 Pointer to the 80-bit floating point value to load.
3193;
3194BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
3195 PROLOGUE_3_ARGS
3196 sub xSP, 20h
3197
3198 fninit
3199 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3200 fld tword [A2]
3201
3202 fnstsw word [A1 + IEMFPURESULT.FSW]
3203 fnclex
3204 fstp tword [A1 + IEMFPURESULT.r80Result]
3205
3206 fninit
3207 add xSP, 20h
3208 EPILOGUE_3_ARGS
3209ENDPROC iemAImpl_fld_r80_from_r80
3210
3211
3212;;
3213; Store an 80-bit floating point register to memory
3214;
3215; @param A0 FPU context (fxsave).
3216; @param A1 Where to return the output FSW.
3217; @param A2 Where to store the 80-bit value.
3218; @param A3 Pointer to the 80-bit register value.
3219;
3220BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
3221 PROLOGUE_4_ARGS
3222 sub xSP, 20h
3223
3224 fninit
3225 fld tword [A3]
3226 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3227 fstp tword [A2]
3228
3229 fnstsw word [A1]
3230
3231 fninit
3232 add xSP, 20h
3233 EPILOGUE_4_ARGS
3234ENDPROC iemAImpl_fst_r80_to_r80
3235
3236
3237;;
3238; Loads an 80-bit floating point register value in BCD format from memory.
3239;
3240; @param A0 FPU context (fxsave).
3241; @param A1 Pointer to a IEMFPURESULT for the output.
3242; @param A2 Pointer to the 80-bit BCD value to load.
3243;
3244BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
3245 PROLOGUE_3_ARGS
3246 sub xSP, 20h
3247
3248 fninit
3249 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3250 fbld tword [A2]
3251
3252 fnstsw word [A1 + IEMFPURESULT.FSW]
3253 fnclex
3254 fstp tword [A1 + IEMFPURESULT.r80Result]
3255
3256 fninit
3257 add xSP, 20h
3258 EPILOGUE_3_ARGS
3259ENDPROC iemAImpl_fld_r80_from_d80
3260
3261
3262;;
3263; Store an 80-bit floating point register to memory as BCD
3264;
3265; @param A0 FPU context (fxsave).
3266; @param A1 Where to return the output FSW.
3267; @param A2 Where to store the 80-bit BCD value.
3268; @param A3 Pointer to the 80-bit register value.
3269;
3270BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
3271 PROLOGUE_4_ARGS
3272 sub xSP, 20h
3273
3274 fninit
3275 fld tword [A3]
3276 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3277 fbstp tword [A2]
3278
3279 fnstsw word [A1]
3280
3281 fninit
3282 add xSP, 20h
3283 EPILOGUE_4_ARGS
3284ENDPROC iemAImpl_fst_r80_to_d80
3285
3286
3287;;
3288; FPU instruction working on two 80-bit floating point values.
3289;
3290; @param 1 The instruction
; @param 2 The instruction operands, e.g. {st0, st1} or {} for none.
3291;
3292; @param A0 FPU context (fxsave).
3293; @param A1 Pointer to a IEMFPURESULT for the output.
3294; @param A2 Pointer to the first 80-bit value (ST0)
3295; @param A3 Pointer to the second 80-bit value (STn).
3296;
3297%macro IEMIMPL_FPU_R80_BY_R80 2
3298BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3299 PROLOGUE_4_ARGS
3300 sub xSP, 20h
3301
3302 fninit
3303 fld tword [A3]
3304 fld tword [A2]
3305 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3306 %1 %2
3307
3308 fnstsw word [A1 + IEMFPURESULT.FSW]
3309 fnclex
3310 fstp tword [A1 + IEMFPURESULT.r80Result]
3311
3312 fninit
3313 add xSP, 20h
3314 EPILOGUE_4_ARGS
3315ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3316%endmacro
3317
3318IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
3319IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
3320IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
3321IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
3322IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
3323IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
3324IEMIMPL_FPU_R80_BY_R80 fprem, {}
3325IEMIMPL_FPU_R80_BY_R80 fprem1, {}
3326IEMIMPL_FPU_R80_BY_R80 fscale, {}
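;
; Illustrative core expansion of 'IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}' (the
; st operands come straight from the second macro argument; [A3] is loaded first
; so it ends up in st1, [A2] second so it becomes st0):
;
;       fld     tword [A3]                      ; -> st1 after the next load
;       fld     tword [A2]                      ; -> st0
;       FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
;       fadd    st0, st1
;       fnstsw  word [A1 + IEMFPURESULT.FSW]
;       fnclex
;       fstp    tword [A1 + IEMFPURESULT.r80Result]
;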
3327
3328
3329;;
3330; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3331; storing the result in ST1 and popping the stack.
3332;
3333; @param 1 The instruction
3334;
3335; @param A0 FPU context (fxsave).
3336; @param A1 Pointer to a IEMFPURESULT for the output.
3337; @param A2 Pointer to the first 80-bit value (ST1).
3338; @param A3 Pointer to the second 80-bit value (ST0).
3339;
3340%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
3341BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3342 PROLOGUE_4_ARGS
3343 sub xSP, 20h
3344
3345 fninit
3346 fld tword [A2]
3347 fld tword [A3]
3348 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3349 %1
3350
3351 fnstsw word [A1 + IEMFPURESULT.FSW]
3352 fnclex
3353 fstp tword [A1 + IEMFPURESULT.r80Result]
3354
3355 fninit
3356 add xSP, 20h
3357 EPILOGUE_4_ARGS
3358ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3359%endmacro
3360
3361IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
3362IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
3363IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3364
3365
3366;;
3367; FPU instruction working on two 80-bit floating point values, only
3368; returning FSW.
3369;
3370; @param 1 The instruction
3371;
3372; @param A0 FPU context (fxsave).
3373; @param A1 Pointer to a uint16_t for the resulting FSW.
3374; @param A2 Pointer to the first 80-bit value.
3375; @param A3 Pointer to the second 80-bit value.
3376;
3377%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
3378BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3379 PROLOGUE_4_ARGS
3380 sub xSP, 20h
3381
3382 fninit
3383 fld tword [A3]
3384 fld tword [A2]
3385 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3386 %1 st0, st1
3387
3388 fnstsw word [A1]
3389
3390 fninit
3391 add xSP, 20h
3392 EPILOGUE_4_ARGS
3393ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3394%endmacro
3395
3396IEMIMPL_FPU_R80_BY_R80_FSW fcom
3397IEMIMPL_FPU_R80_BY_R80_FSW fucom
3398
3399
3400;;
3401; FPU instruction working on two 80-bit floating point values,
3402; returning FSW and EFLAGS (eax).
3403;
3404; @param 1 The instruction
3405;
3406; @returns EFLAGS in EAX.
3407; @param A0 FPU context (fxsave).
3408; @param A1 Pointer to a uint16_t for the resulting FSW.
3409; @param A2 Pointer to the first 80-bit value.
3410; @param A3 Pointer to the second 80-bit value.
3411;
3412%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
3413BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
3414 PROLOGUE_4_ARGS
3415 sub xSP, 20h
3416
3417 fninit
3418 fld tword [A3]
3419 fld tword [A2]
3420 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3421 %1 st1
3422
3423 fnstsw word [A1]
3424 pushf
3425 pop xAX
3426
3427 fninit
3428 add xSP, 20h
3429 EPILOGUE_4_ARGS
3430ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
3431%endmacro
3432
3433IEMIMPL_FPU_R80_BY_R80_EFL fcomi
3434IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3435
3436
3437;;
3438; FPU instruction working on one 80-bit floating point value.
3439;
3440; @param 1 The instruction
3441;
3442; @param A0 FPU context (fxsave).
3443; @param A1 Pointer to a IEMFPURESULT for the output.
3444; @param A2 Pointer to the 80-bit value.
3445;
3446%macro IEMIMPL_FPU_R80 1
3447BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3448 PROLOGUE_3_ARGS
3449 sub xSP, 20h
3450
3451 fninit
3452 fld tword [A2]
3453 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3454 %1
3455
3456 fnstsw word [A1 + IEMFPURESULT.FSW]
3457 fnclex
3458 fstp tword [A1 + IEMFPURESULT.r80Result]
3459
3460 fninit
3461 add xSP, 20h
3462 EPILOGUE_3_ARGS
3463ENDPROC iemAImpl_ %+ %1 %+ _r80
3464%endmacro
3465
3466IEMIMPL_FPU_R80 fchs
3467IEMIMPL_FPU_R80 fabs
3468IEMIMPL_FPU_R80 f2xm1
3469IEMIMPL_FPU_R80 fsqrt
3470IEMIMPL_FPU_R80 frndint
3471IEMIMPL_FPU_R80 fsin
3472IEMIMPL_FPU_R80 fcos
3473
3474
3475;;
3476; FPU instruction working on one 80-bit floating point value, only
3477; returning FSW.
3478;
3479; @param 1 The instruction
3480; @param 2 Non-zero to also restore FTW.
3481;
3482; @param A0 FPU context (fxsave).
3483; @param A1 Pointer to a uint16_t for the resulting FSW.
3484; @param A2 Pointer to the 80-bit value.
3485;
3486%macro IEMIMPL_FPU_R80_FSW 2
3487BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3488 PROLOGUE_3_ARGS
3489 sub xSP, 20h
3490
3491 fninit
3492 fld tword [A2]
3493%if %2 != 0
3494 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
3495%else
3496 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3497%endif
3498 %1
3499
3500 fnstsw word [A1]
3501
3502 fninit
3503 add xSP, 20h
3504 EPILOGUE_3_ARGS
3505ENDPROC iemAImpl_ %+ %1 %+ _r80
3506%endmacro
3507
3508IEMIMPL_FPU_R80_FSW ftst, 0
3509IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3510
3511
3512
3513;;
3514; FPU instruction loading an 80-bit floating point constant.
3515;
3516; @param 1 The instruction
3517;
3518; @param A0 FPU context (fxsave).
3519; @param A1 Pointer to a IEMFPURESULT for the output.
3520;
3521%macro IEMIMPL_FPU_R80_CONST 1
3522BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3523 PROLOGUE_2_ARGS
3524 sub xSP, 20h
3525
3526 fninit
3527 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3528 %1
3529
3530 fnstsw word [A1 + IEMFPURESULT.FSW]
3531 fnclex
3532 fstp tword [A1 + IEMFPURESULT.r80Result]
3533
3534 fninit
3535 add xSP, 20h
3536 EPILOGUE_2_ARGS
3537ENDPROC iemAImpl_ %+ %1 %+
3538%endmacro
3539
3540IEMIMPL_FPU_R80_CONST fld1
3541IEMIMPL_FPU_R80_CONST fldl2t
3542IEMIMPL_FPU_R80_CONST fldl2e
3543IEMIMPL_FPU_R80_CONST fldpi
3544IEMIMPL_FPU_R80_CONST fldlg2
3545IEMIMPL_FPU_R80_CONST fldln2
3546IEMIMPL_FPU_R80_CONST fldz
3547
3548
3549;;
3550; FPU instruction working on one 80-bit floating point value, outputting two.
3551;
3552; @param 1 The instruction
3553;
3554; @param A0 FPU context (fxsave).
3555; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3556; @param A2 Pointer to the 80-bit value.
3557;
3558%macro IEMIMPL_FPU_R80_R80 1
3559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3560 PROLOGUE_3_ARGS
3561 sub xSP, 20h
3562
3563 fninit
3564 fld tword [A2]
3565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3566 %1
3567
3568 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3569 fnclex
3570 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
3571 fnclex
3572 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3573
3574 fninit
3575 add xSP, 20h
3576 EPILOGUE_3_ARGS
3577ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3578%endmacro
3579
3580IEMIMPL_FPU_R80_R80 fptan
3581IEMIMPL_FPU_R80_R80 fxtract
3582IEMIMPL_FPU_R80_R80 fsincos
3583
3584
3585
3586
3587;---------------------- SSE and MMX Operations ----------------------
3588
3589;; @todo what do we need to do for MMX?
3590%macro IEMIMPL_MMX_PROLOGUE 0
3591%endmacro
3592%macro IEMIMPL_MMX_EPILOGUE 0
3593%endmacro
3594
3595;; @todo what do we need to do for SSE?
3596%macro IEMIMPL_SSE_PROLOGUE 0
3597%endmacro
3598%macro IEMIMPL_SSE_EPILOGUE 0
3599%endmacro
3600
3601;; @todo what do we need to do for AVX?
3602%macro IEMIMPL_AVX_PROLOGUE 0
3603%endmacro
3604%macro IEMIMPL_AVX_EPILOGUE 0
3605%endmacro
3606
3607
3608;;
3609; Media instruction working on two full sized registers.
3610;
3611; @param 1 The instruction
3612; @param 2 Whether there is an MMX variant (1) or not (0).
3613;
3614; @param A0 FPU context (fxsave).
3615; @param A1 Pointer to the first media register size operand (input/output).
3616; @param A2 Pointer to the second media register size operand (input).
3617;
3618%macro IEMIMPL_MEDIA_F2 2
3619%if %2 != 0
3620BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3621 PROLOGUE_3_ARGS
3622 IEMIMPL_MMX_PROLOGUE
3623
3624 movq mm0, [A1]
3625 movq mm1, [A2]
3626 %1 mm0, mm1
3627 movq [A1], mm0
3628
3629 IEMIMPL_MMX_EPILOGUE
3630 EPILOGUE_3_ARGS
3631ENDPROC iemAImpl_ %+ %1 %+ _u64
3632%endif
3633
3634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3635 PROLOGUE_3_ARGS
3636 IEMIMPL_SSE_PROLOGUE
3637
3638 movdqu xmm0, [A1]
3639 movdqu xmm1, [A2]
3640 %1 xmm0, xmm1
3641 movdqu [A1], xmm0
3642
3643 IEMIMPL_SSE_EPILOGUE
3644 EPILOGUE_3_ARGS
3645ENDPROC iemAImpl_ %+ %1 %+ _u128
3646%endmacro
3647
3648IEMIMPL_MEDIA_F2 pshufb, 1
3649IEMIMPL_MEDIA_F2 pand, 1
3650IEMIMPL_MEDIA_F2 pandn, 1
3651IEMIMPL_MEDIA_F2 por, 1
3652IEMIMPL_MEDIA_F2 pxor, 1
3653IEMIMPL_MEDIA_F2 pcmpeqb, 1
3654IEMIMPL_MEDIA_F2 pcmpeqw, 1
3655IEMIMPL_MEDIA_F2 pcmpeqd, 1
3656IEMIMPL_MEDIA_F2 pcmpeqq, 0
3657IEMIMPL_MEDIA_F2 pcmpgtb, 1
3658IEMIMPL_MEDIA_F2 pcmpgtw, 1
3659IEMIMPL_MEDIA_F2 pcmpgtd, 1
3660IEMIMPL_MEDIA_F2 pcmpgtq, 0
3661IEMIMPL_MEDIA_F2 paddb, 1
3662IEMIMPL_MEDIA_F2 paddw, 1
3663IEMIMPL_MEDIA_F2 paddd, 1
3664IEMIMPL_MEDIA_F2 paddq, 1
3665IEMIMPL_MEDIA_F2 paddsb, 1
3666IEMIMPL_MEDIA_F2 paddsw, 1
3667IEMIMPL_MEDIA_F2 paddusb, 1
3668IEMIMPL_MEDIA_F2 paddusw, 1
3669IEMIMPL_MEDIA_F2 psubb, 1
3670IEMIMPL_MEDIA_F2 psubw, 1
3671IEMIMPL_MEDIA_F2 psubd, 1
3672IEMIMPL_MEDIA_F2 psubq, 1
3673IEMIMPL_MEDIA_F2 psubsb, 1
3674IEMIMPL_MEDIA_F2 psubsw, 1
3675IEMIMPL_MEDIA_F2 psubusb, 1
3676IEMIMPL_MEDIA_F2 psubusw, 1
3677IEMIMPL_MEDIA_F2 pmullw, 1
3678IEMIMPL_MEDIA_F2 pmulld, 0
3679IEMIMPL_MEDIA_F2 pmulhw, 1
3680IEMIMPL_MEDIA_F2 pmaddwd, 1
3681IEMIMPL_MEDIA_F2 pminub, 1
3682IEMIMPL_MEDIA_F2 pminuw, 0
3683IEMIMPL_MEDIA_F2 pminud, 0
3684IEMIMPL_MEDIA_F2 pminsb, 0
3685IEMIMPL_MEDIA_F2 pminsw, 1
3686IEMIMPL_MEDIA_F2 pminsd, 0
3687IEMIMPL_MEDIA_F2 pmaxub, 1
3688IEMIMPL_MEDIA_F2 pmaxuw, 0
3689IEMIMPL_MEDIA_F2 pmaxud, 0
3690IEMIMPL_MEDIA_F2 pmaxsb, 0
3691IEMIMPL_MEDIA_F2 pmaxsw, 1
3692IEMIMPL_MEDIA_F2 pmaxsd, 0
3693IEMIMPL_MEDIA_F2 pabsb, 1
3694IEMIMPL_MEDIA_F2 pabsw, 1
3695IEMIMPL_MEDIA_F2 pabsd, 1
3696IEMIMPL_MEDIA_F2 psignb, 1
3697IEMIMPL_MEDIA_F2 psignw, 1
3698IEMIMPL_MEDIA_F2 psignd, 1
3699IEMIMPL_MEDIA_F2 phaddw, 1
3700IEMIMPL_MEDIA_F2 phaddd, 1
3701IEMIMPL_MEDIA_F2 phsubw, 1
3702IEMIMPL_MEDIA_F2 phsubd, 1
3703IEMIMPL_MEDIA_F2 phaddsw, 1
3704IEMIMPL_MEDIA_F2 phsubsw, 1
3705IEMIMPL_MEDIA_F2 pmaddubsw, 1
3706IEMIMPL_MEDIA_F2 pmulhrsw, 1
3707IEMIMPL_MEDIA_F2 pmuludq, 1
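;
; Illustrative expansion of 'IEMIMPL_MEDIA_F2 paddsb, 1' for the 128-bit case
; (the MMX variant is generated as well because the second argument is 1):
;
;       BEGINPROC_FASTCALL iemAImpl_paddsb_u128, 12
;           PROLOGUE_3_ARGS
;           IEMIMPL_SSE_PROLOGUE
;           movdqu  xmm0, [A1]                  ; first operand (input/output)
;           movdqu  xmm1, [A2]                  ; second operand (input)
;           paddsb  xmm0, xmm1
;           movdqu  [A1], xmm0
;           IEMIMPL_SSE_EPILOGUE
;           EPILOGUE_3_ARGS
;       ENDPROC iemAImpl_paddsb_u128
;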
3708
3709
3710;;
3711; Media instruction working on two full sized registers, but no FXSAVE state argument.
3712;
3713; @param 1 The instruction
3714; @param 2 Whether there is an MMX variant (1) or not (0).
3715;
3716; @param A0 Pointer to the first media register size operand (input/output).
3717; @param A1 Pointer to the second media register size operand (input).
3718;
3719%macro IEMIMPL_MEDIA_OPT_F2 2
3720%if %2 != 0
3721BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3722 PROLOGUE_2_ARGS
3723 IEMIMPL_MMX_PROLOGUE
3724
3725 movq mm0, [A0]
3726 movq mm1, [A1]
3727 %1 mm0, mm1
3728 movq [A0], mm0
3729
3730 IEMIMPL_MMX_EPILOGUE
3731 EPILOGUE_2_ARGS
3732ENDPROC iemAImpl_ %+ %1 %+ _u64
3733%endif
3734
3735BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3736 PROLOGUE_2_ARGS
3737 IEMIMPL_SSE_PROLOGUE
3738
3739 movdqu xmm0, [A0]
3740 movdqu xmm1, [A1]
3741 %1 xmm0, xmm1
3742 movdqu [A0], xmm0
3743
3744 IEMIMPL_SSE_EPILOGUE
3745 EPILOGUE_2_ARGS
3746ENDPROC iemAImpl_ %+ %1 %+ _u128
3747%endmacro
3748
3749IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3750IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3751IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3752IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3753IEMIMPL_MEDIA_OPT_F2 psllw, 1
3754IEMIMPL_MEDIA_OPT_F2 pslld, 1
3755IEMIMPL_MEDIA_OPT_F2 psllq, 1
3756IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3757IEMIMPL_MEDIA_OPT_F2 psrld, 1
3758IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3759IEMIMPL_MEDIA_OPT_F2 psraw, 1
3760IEMIMPL_MEDIA_OPT_F2 psrad, 1
3761IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3762IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3763IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3764IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3765IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3766IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3767IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3768IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3769IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3770IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3771IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3772IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3773IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3774IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3775IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3776IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3777IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3778IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3779IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3780IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3781
3782;;
3783; Media instruction working on one full sized and one half sized register (lower half).
3784;
3785; @param 1 The instruction
3786; @param 2 1 if MMX is included, 0 if not.
3787;
3788; @param A0 Pointer to the first full sized media register operand (input/output).
3789; @param A1 Pointer to the second half sized media register operand (input).
3790;
3791%macro IEMIMPL_MEDIA_F1L1 2
3792 %if %2 != 0
3793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3794 PROLOGUE_2_ARGS
3795 IEMIMPL_MMX_PROLOGUE
3796
3797 movq mm0, [A0]
3798 movq mm1, [A1]
3799 %1 mm0, mm1
3800 movq [A0], mm0
3801
3802 IEMIMPL_MMX_EPILOGUE
3803 EPILOGUE_2_ARGS
3804ENDPROC iemAImpl_ %+ %1 %+ _u64
3805 %endif
3806
3807BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3808 PROLOGUE_2_ARGS
3809 IEMIMPL_SSE_PROLOGUE
3810
3811 movdqu xmm0, [A0]
3812 movdqu xmm1, [A1]
3813 %1 xmm0, xmm1
3814 movdqu [A0], xmm0
3815
3816 IEMIMPL_SSE_EPILOGUE
3817 EPILOGUE_2_ARGS
3818ENDPROC iemAImpl_ %+ %1 %+ _u128
3819%endmacro
3820
3821IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3822IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3823IEMIMPL_MEDIA_F1L1 punpckldq, 1
3824IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3825
3826
3827;;
3828; Media instruction working on two half sized input registers (lower half) and a full sized
3829; destination register (vpunpckl*).
3830;
3831; @param 1 The instruction
3832;
3833; @param A0 Pointer to the destination register (full sized, output only).
3834; @param A1 Pointer to the first full sized media source register operand, where we
3835; will only use the lower half as input - but we'll be loading it in full.
3836; @param A2 Pointer to the second full sized media source register operand, where we
3837; will only use the lower half as input - but we'll be loading it in full.
3838;
3839%macro IEMIMPL_MEDIA_F1L1L1 1
3840BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3841 PROLOGUE_3_ARGS
3842 IEMIMPL_AVX_PROLOGUE
3843
3844 vmovdqu xmm0, [A1]
3845 vmovdqu xmm1, [A2]
3846 %1 xmm0, xmm0, xmm1
3847 vmovdqu [A0], xmm0
3848
3849 IEMIMPL_AVX_EPILOGUE
3850 EPILOGUE_3_ARGS
3851ENDPROC iemAImpl_ %+ %1 %+ _u128
3852
3853BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3854 PROLOGUE_3_ARGS
3855 IEMIMPL_AVX_PROLOGUE
3856
3857 vmovdqu ymm0, [A1]
3858 vmovdqu ymm1, [A2]
3859 %1 ymm0, ymm0, ymm1
3860 vmovdqu [A0], ymm0
3861
3862 IEMIMPL_AVX_EPILOGUE
3863 EPILOGUE_3_ARGS
3864ENDPROC iemAImpl_ %+ %1 %+ _u256
3865%endmacro
3866
3867IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3868IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3869IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3870IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3871
3872
3873;;
3874; Media instruction working on one full sized and one half sized register (high half).
3875;
3876; @param 1 The instruction
3877; @param 2 1 if MMX is included, 0 if not.
3878;
3879; @param A0 Pointer to the first full sized media register operand (input/output).
3880; @param A1 Pointer to the second full sized media register operand, where we
3881; will only use the upper half as input - but we'll load it in full.
3882;
3883%macro IEMIMPL_MEDIA_F1H1 2
3884IEMIMPL_MEDIA_F1L1 %1, %2
3885%endmacro
3886
3887IEMIMPL_MEDIA_F1L1 punpckhbw, 1
3888IEMIMPL_MEDIA_F1L1 punpckhwd, 1
3889IEMIMPL_MEDIA_F1L1 punpckhdq, 1
3890IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3891
3892
3893;;
3894; Media instruction working on two half sized input registers (high half) and a full sized
3895; destination register (vpunpckh*).
3896;
3897; @param 1 The instruction
3898;
3899; @param A0 Pointer to the destination register (full sized, output only).
3900; @param A1 Pointer to the first full sized media source register operand, where we
3901; will only use the upper half as input - but we'll be loading it in full.
3902; @param A2 Pointer to the second full sized media source register operand, where we
3903; will only use the upper half as input - but we'll be loading it in full.
3904;
3905%macro IEMIMPL_MEDIA_F1H1H1 1
3906IEMIMPL_MEDIA_F1L1L1 %1
3907%endmacro
3908
3909IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3910IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3911IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3912IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3913
3914
3915;
3916; Shufflers with evil 8-bit immediates.
3917;
3918
3919BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
3920 PROLOGUE_3_ARGS
3921 IEMIMPL_MMX_PROLOGUE
3922
3923 movq mm1, [A1]
3924 movq mm0, mm0 ; paranoia!
3925 lea T1, [.imm0 xWrtRIP]
3926 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3927 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
3928 %else
3929 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
3930 %endif
3931 lea T1, [T1 + T0]
3932 IBT_NOTRACK
3933 call T1
3934 movq [A0], mm0
3935
3936 IEMIMPL_MMX_EPILOGUE
3937 EPILOGUE_3_ARGS
3938%assign bImm 0
3939%rep 256
3940.imm %+ bImm:
3941 IBT_ENDBRxx_WITHOUT_NOTRACK
3942 pshufw mm0, mm1, bImm
3943 ret
3944 %assign bImm bImm + 1
3945%endrep
3946.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
3947ENDPROC iemAImpl_pshufw_u64
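;
; The 256-entry jump table above bakes the immediate into the code itself: stub N
; is 'pshufw mm0, mm1, N' followed by 'ret', so dispatching is just a matter of
; scaling the immediate by the stub size.  Worked example for A2 = 1Bh without
; IBT (stub size 5: 4 bytes of pshufw + 1 byte of ret):
;
;       lea     T0, [A2 + A2*4]                 ; T0 = 1Bh * 5 = 87h
;       lea     T1, [T1 + T0]                   ; T1 = .imm0 + 87h = address of .imm27
;       call    T1                              ; executes 'pshufw mm0, mm1, 1Bh' + 'ret'
;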
3948
3949
3950%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
3951BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
3952 PROLOGUE_3_ARGS
3953 IEMIMPL_SSE_PROLOGUE
3954
3955 movdqu xmm1, [A1]
3956 movdqu xmm0, xmm1 ; paranoia!
3957 lea T1, [.imm0 xWrtRIP]
3958 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3959 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3960 %else
3961 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
3962 %endif
3963 lea T1, [T1 + T0*2]
3964 IBT_NOTRACK
3965 call T1
3966 movdqu [A0], xmm0
3967
3968 IEMIMPL_SSE_EPILOGUE
3969 EPILOGUE_3_ARGS
3970
3971 %assign bImm 0
3972 %rep 256
3973.imm %+ bImm:
3974 IBT_ENDBRxx_WITHOUT_NOTRACK
3975 %1 xmm0, xmm1, bImm
3976 ret
3977 %assign bImm bImm + 1
3978 %endrep
3979.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
3980ENDPROC iemAImpl_ %+ %1 %+ _u128
3981%endmacro
3982
3983IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
3984IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
3985IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3986
3987
3988%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
3989BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
3990 PROLOGUE_3_ARGS
3991 IEMIMPL_SSE_PROLOGUE
3992
3993 vmovdqu ymm1, [A1]
3994 vmovdqu ymm0, ymm1 ; paranoia!
3995 lea T1, [.imm0 xWrtRIP]
3996 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3997 lea T0, [A2 + A2*4] ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
3998 %else
3999 lea T0, [A2 + A2*2] ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4000 %endif
4001 lea T1, [T1 + T0*2]
4002 IBT_NOTRACK
4003 call T1
4004 vmovdqu [A0], ymm0
4005
4006 IEMIMPL_SSE_EPILOGUE
4007 EPILOGUE_3_ARGS
4008 %assign bImm 0
4009 %rep 256
4010.imm %+ bImm:
4011 IBT_ENDBRxx_WITHOUT_NOTRACK
4012 %1 ymm0, ymm1, bImm
4013 ret
4014 %assign bImm bImm + 1
4015 %endrep
4016.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4017ENDPROC iemAImpl_ %+ %1 %+ _u256
4018%endmacro
4019
4020IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
4021IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
4022IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4023
4024
4025;
4026; Shifts with evil 8-bit immediates.
4027;
4028
4029%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4031 PROLOGUE_2_ARGS
4032 IEMIMPL_MMX_PROLOGUE
4033
4034 movq mm0, [A0]
4035 lea T1, [.imm0 xWrtRIP]
4036 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4037 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4038 %else
4039 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4040 %endif
4041 lea T1, [T1 + T0]
4042 IBT_NOTRACK
4043 call T1
4044 movq [A0], mm0
4045
4046 IEMIMPL_MMX_EPILOGUE
4047 EPILOGUE_2_ARGS
4048%assign bImm 0
4049%rep 256
4050.imm %+ bImm:
4051 IBT_ENDBRxx_WITHOUT_NOTRACK
4052 %1 mm0, bImm
4053 ret
4054 %assign bImm bImm + 1
4055%endrep
4056.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4057ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4058%endmacro
4059
4060IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4061IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4062IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4063IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4064IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4065IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4066IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4067IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4068
4069
4070%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
4071BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
4072 PROLOGUE_2_ARGS
4073 IEMIMPL_SSE_PROLOGUE
4074
4075 movdqu xmm0, [A0]
4076 lea T1, [.imm0 xWrtRIP]
4077 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4078 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
4079 %else
4080 lea T0, [A1 + A1*2] ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
4081 %endif
4082 lea T1, [T1 + T0*2]
4083 IBT_NOTRACK
4084 call T1
4085 movdqu [A0], xmm0
4086
4087 IEMIMPL_SSE_EPILOGUE
4088 EPILOGUE_2_ARGS
4089 %assign bImm 0
4090 %rep 256
4091.imm %+ bImm:
4092 IBT_ENDBRxx_WITHOUT_NOTRACK
4093 %1 xmm0, bImm
4094 ret
4095 %assign bImm bImm + 1
4096 %endrep
4097.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4098ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
4099%endmacro
4100
4101IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
4102IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
4103IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
4104IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
4105IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
4106IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
4107IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
4108IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
4109IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
4110IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4111
4112
4113;
4114; Move byte mask.
4115;
4116
4117BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4118 PROLOGUE_2_ARGS
4119 IEMIMPL_MMX_PROLOGUE
4120
4121 movq mm1, [A1]
4122 pmovmskb T0, mm1
4123 mov [A0], T0
4124%ifdef RT_ARCH_X86
4125 mov dword [A0 + 4], 0
4126%endif
4127 IEMIMPL_MMX_EPILOGUE
4128 EPILOGUE_2_ARGS
4129ENDPROC iemAImpl_pmovmskb_u64
4130
4131BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
4132 PROLOGUE_2_ARGS
4133 IEMIMPL_SSE_PROLOGUE
4134
4135 movdqu xmm1, [A1]
4136 pmovmskb T0, xmm1
4137 mov [A0], T0
4138%ifdef RT_ARCH_X86
4139 mov dword [A0 + 4], 0
4140%endif
4141 IEMIMPL_SSE_EPILOGUE
4142 EPILOGUE_2_ARGS
4143ENDPROC iemAImpl_pmovmskb_u128
4144
4145BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
4146 PROLOGUE_2_ARGS
4147 IEMIMPL_AVX_PROLOGUE
4148
4149 vmovdqu ymm1, [A1]
4150 vpmovmskb T0, ymm1
4151 mov [A0], T0
4152%ifdef RT_ARCH_X86
4153 mov dword [A0 + 4], 0
4154%endif
4155 IEMIMPL_AVX_EPILOGUE
4156 EPILOGUE_2_ARGS
4157ENDPROC iemAImpl_vpmovmskb_u256
4158
4159
4160;;
4161; Media instruction working on two full sized source registers and one destination (AVX).
4162;
4163; @param 1 The instruction
4164;
4165; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4166; @param A1 Pointer to the destination media register size operand (output).
4167; @param A2 Pointer to the first source media register size operand (input).
4168; @param A3 Pointer to the second source media register size operand (input).
4169;
4170%macro IEMIMPL_MEDIA_F3 1
4171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4172 PROLOGUE_4_ARGS
4173 IEMIMPL_AVX_PROLOGUE
4174
4175 vmovdqu xmm0, [A2]
4176 vmovdqu xmm1, [A3]
4177 %1 xmm0, xmm0, xmm1
4178 vmovdqu [A1], xmm0
4179
4180 IEMIMPL_AVX_EPILOGUE
4181 EPILOGUE_4_ARGS
4182ENDPROC iemAImpl_ %+ %1 %+ _u128
4183
4184BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4185 PROLOGUE_4_ARGS
4186 IEMIMPL_AVX_PROLOGUE
4187
4188 vmovdqu ymm0, [A2]
4189 vmovdqu ymm1, [A3]
4190 %1 ymm0, ymm0, ymm1
4191 vmovdqu [A1], ymm0
4192
4193 IEMIMPL_AVX_EPILOGUE
4194 EPILOGUE_4_ARGS
4195ENDPROC iemAImpl_ %+ %1 %+ _u256
4196%endmacro
4197
4198IEMIMPL_MEDIA_F3 vpshufb
4199IEMIMPL_MEDIA_F3 vpand
4200IEMIMPL_MEDIA_F3 vpminub
4201IEMIMPL_MEDIA_F3 vpminuw
4202IEMIMPL_MEDIA_F3 vpminud
4203IEMIMPL_MEDIA_F3 vpminsb
4204IEMIMPL_MEDIA_F3 vpminsw
4205IEMIMPL_MEDIA_F3 vpminsd
4206IEMIMPL_MEDIA_F3 vpmaxub
4207IEMIMPL_MEDIA_F3 vpmaxuw
4208IEMIMPL_MEDIA_F3 vpmaxud
4209IEMIMPL_MEDIA_F3 vpmaxsb
4210IEMIMPL_MEDIA_F3 vpmaxsw
4211IEMIMPL_MEDIA_F3 vpmaxsd
4212IEMIMPL_MEDIA_F3 vpandn
4213IEMIMPL_MEDIA_F3 vpor
4214IEMIMPL_MEDIA_F3 vpxor
4215IEMIMPL_MEDIA_F3 vpcmpeqb
4216IEMIMPL_MEDIA_F3 vpcmpeqw
4217IEMIMPL_MEDIA_F3 vpcmpeqd
4218IEMIMPL_MEDIA_F3 vpcmpeqq
4219IEMIMPL_MEDIA_F3 vpcmpgtb
4220IEMIMPL_MEDIA_F3 vpcmpgtw
4221IEMIMPL_MEDIA_F3 vpcmpgtd
4222IEMIMPL_MEDIA_F3 vpcmpgtq
4223IEMIMPL_MEDIA_F3 vpaddb
4224IEMIMPL_MEDIA_F3 vpaddw
4225IEMIMPL_MEDIA_F3 vpaddd
4226IEMIMPL_MEDIA_F3 vpaddq
4227IEMIMPL_MEDIA_F3 vpsubb
4228IEMIMPL_MEDIA_F3 vpsubw
4229IEMIMPL_MEDIA_F3 vpsubd
4230IEMIMPL_MEDIA_F3 vpsubq
4231
4232
4233;;
4234; Media instruction working on two full sized source registers and one destination (AVX),
4235; but no XSAVE state pointer argument.
4236;
4237; @param 1 The instruction
4238;
4239; @param A0 Pointer to the destination media register size operand (output).
4240; @param A1 Pointer to the first source media register size operand (input).
4241; @param A2 Pointer to the second source media register size operand (input).
4242;
4243%macro IEMIMPL_MEDIA_OPT_F3 1
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_AVX_PROLOGUE
4247
4248 vmovdqu xmm0, [A1]
4249 vmovdqu xmm1, [A2]
4250 %1 xmm0, xmm0, xmm1
4251 vmovdqu [A0], xmm0
4252
4253 IEMIMPL_AVX_EPILOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256
4257BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4258 PROLOGUE_3_ARGS
4259 IEMIMPL_AVX_PROLOGUE
4260
4261 vmovdqu ymm0, [A1]
4262 vmovdqu ymm1, [A2]
4263 %1 ymm0, ymm0, ymm1
4264 vmovdqu [A0], ymm0
4265
4266 IEMIMPL_AVX_EPILOGUE
4267 EPILOGUE_3_ARGS
4268ENDPROC iemAImpl_ %+ %1 %+ _u256
4269%endmacro
4270
4271IEMIMPL_MEDIA_OPT_F3 vpacksswb
4272IEMIMPL_MEDIA_OPT_F3 vpackssdw
4273IEMIMPL_MEDIA_OPT_F3 vpackuswb
4274IEMIMPL_MEDIA_OPT_F3 vpackusdw
4275IEMIMPL_MEDIA_OPT_F3 vpmullw
4276IEMIMPL_MEDIA_OPT_F3 vpmulld
4277IEMIMPL_MEDIA_OPT_F3 vpmulhw
4278IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4279IEMIMPL_MEDIA_OPT_F3 vpavgb
4280IEMIMPL_MEDIA_OPT_F3 vpavgw
4281IEMIMPL_MEDIA_OPT_F3 vpsignb
4282IEMIMPL_MEDIA_OPT_F3 vpsignw
4283IEMIMPL_MEDIA_OPT_F3 vpsignd
4284IEMIMPL_MEDIA_OPT_F3 vphaddw
4285IEMIMPL_MEDIA_OPT_F3 vphaddd
4286IEMIMPL_MEDIA_OPT_F3 vphsubw
4287IEMIMPL_MEDIA_OPT_F3 vphsubd
4288IEMIMPL_MEDIA_OPT_F3 vphaddsw
4289IEMIMPL_MEDIA_OPT_F3 vphsubsw
4290IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4291IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4292IEMIMPL_MEDIA_OPT_F3 vpsadbw
4293IEMIMPL_MEDIA_OPT_F3 vpmuldq
4294IEMIMPL_MEDIA_OPT_F3 vpmuludq
4295IEMIMPL_MEDIA_OPT_F3 vunpcklps
4296IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4297IEMIMPL_MEDIA_OPT_F3 vunpckhps
4298IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4299IEMIMPL_MEDIA_OPT_F3 vpsubsb
4300IEMIMPL_MEDIA_OPT_F3 vpsubsw
4301IEMIMPL_MEDIA_OPT_F3 vpsubusb
4302IEMIMPL_MEDIA_OPT_F3 vpsubusw
4303IEMIMPL_MEDIA_OPT_F3 vpaddusb
4304IEMIMPL_MEDIA_OPT_F3 vpaddusw
4305IEMIMPL_MEDIA_OPT_F3 vpaddsb
4306IEMIMPL_MEDIA_OPT_F3 vpaddsw
4307
4308
4309;;
4310; Media instruction working on one full sized source registers and one destination (AVX),
4311; but no XSAVE state pointer argument.
4312;
4313; @param 1 The instruction
4314; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4315;
4316; @param A0 Pointer to the destination media register size operand (output).
4317; @param A1 Pointer to the source media register size operand (input).
4318;
4319%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4320BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4321 PROLOGUE_2_ARGS
4322 IEMIMPL_AVX_PROLOGUE
4323
4324 vmovdqu xmm0, [A1]
4325 %1 xmm0, xmm0
4326 vmovdqu [A0], xmm0
4327
4328 IEMIMPL_AVX_EPILOGUE
4329 EPILOGUE_2_ARGS
4330ENDPROC iemAImpl_ %+ %1 %+ _u128
4331
4332 %if %2 == 1
4333BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4334 PROLOGUE_2_ARGS
4335 IEMIMPL_AVX_PROLOGUE
4336
4337 vmovdqu ymm0, [A1]
4338 %1 ymm0, ymm0
4339 vmovdqu [A0], ymm0
4340
4341 IEMIMPL_AVX_EPILOGUE
4342 EPILOGUE_2_ARGS
4343ENDPROC iemAImpl_ %+ %1 %+ _u256
4344 %endif
4345%endmacro
4346
4347IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4348IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4349IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4350IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4351
4352
4353;
4354; The SSE 4.2 crc32
4355;
4356; @param A1 Pointer to the 32-bit destination.
4357; @param A2 The source operand, sized according to the suffix.
4358;
4359BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4360 PROLOGUE_2_ARGS
4361
4362 mov T0_32, [A0]
4363 crc32 T0_32, A1_8
4364 mov [A0], T0_32
4365
4366 EPILOGUE_2_ARGS
4367ENDPROC iemAImpl_crc32_u8
4368
4369BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
4370 PROLOGUE_2_ARGS
4371
4372 mov T0_32, [A0]
4373 crc32 T0_32, A1_16
4374 mov [A0], T0_32
4375
4376 EPILOGUE_2_ARGS
4377ENDPROC iemAImpl_crc32_u16
4378
4379BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
4380 PROLOGUE_2_ARGS
4381
4382 mov T0_32, [A0]
4383 crc32 T0_32, A1_32
4384 mov [A0], T0_32
4385
4386 EPILOGUE_2_ARGS
4387ENDPROC iemAImpl_crc32_u32
4388
4389%ifdef RT_ARCH_AMD64
4390BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
4391 PROLOGUE_2_ARGS
4392
4393 mov T0_32, [A0]
4394 crc32 T0, A1
4395 mov [A0], T0_32
4396
4397 EPILOGUE_2_ARGS
4398ENDPROC iemAImpl_crc32_u64
4399%endif
4400
4401
4402;
4403; PTEST (SSE 4.1)
4404;
4405; @param A0 Pointer to the first source operand (aka readonly destination).
4406; @param A1 Pointer to the second source operand.
4407; @param A2 Pointer to the EFLAGS register.
4408;
4409BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4410 PROLOGUE_3_ARGS
4411 IEMIMPL_SSE_PROLOGUE
4412
4413 movdqu xmm0, [A0]
4414 movdqu xmm1, [A1]
4415 ptest xmm0, xmm1
4416 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4417
4418 IEMIMPL_SSE_EPILOGUE
4419 EPILOGUE_3_ARGS
4420ENDPROC iemAImpl_ptest_u128
4421
4422BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
4423 PROLOGUE_3_ARGS
4424 IEMIMPL_SSE_PROLOGUE
4425
4426 vmovdqu ymm0, [A0]
4427 vmovdqu ymm1, [A1]
4428 vptest ymm0, ymm1
4429 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4430
4431 IEMIMPL_SSE_EPILOGUE
4432 EPILOGUE_3_ARGS
4433ENDPROC iemAImpl_vptest_u256
4434
4435
4436;;
4437; Template for the [v]pmov{s,z}x* instructions
4438;
4439; @param 1 The instruction
4440;
4441; @param A0 Pointer to the destination media register size operand (output).
4442; @param A1 The source operand value (input); the 256-bit variant takes a pointer to the 128-bit source instead.
4443;
4444%macro IEMIMPL_V_PMOV_SZ_X 1
4445BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4446 PROLOGUE_2_ARGS
4447 IEMIMPL_SSE_PROLOGUE
4448
4449 movd xmm0, A1
4450 %1 xmm0, xmm0
4451 vmovdqu [A0], xmm0
4452
4453 IEMIMPL_SSE_EPILOGUE
4454 EPILOGUE_2_ARGS
4455ENDPROC iemAImpl_ %+ %1 %+ _u128
4456
4457BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4458 PROLOGUE_2_ARGS
4459 IEMIMPL_AVX_PROLOGUE
4460
4461 movd xmm0, A1
4462 v %+ %1 xmm0, xmm0
4463 vmovdqu [A0], xmm0
4464
4465 IEMIMPL_AVX_EPILOGUE
4466 EPILOGUE_2_ARGS
4467ENDPROC iemAImpl_v %+ %1 %+ _u128
4468
4469BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4470 PROLOGUE_2_ARGS
4471 IEMIMPL_AVX_PROLOGUE
4472
4473 movdqu xmm0, [A1]
4474 v %+ %1 ymm0, xmm0
4475 vmovdqu [A0], ymm0
4476
4477 IEMIMPL_AVX_EPILOGUE
4478 EPILOGUE_2_ARGS
4479ENDPROC iemAImpl_v %+ %1 %+ _u256
4480%endmacro
4481
4482IEMIMPL_V_PMOV_SZ_X pmovsxbw
4483IEMIMPL_V_PMOV_SZ_X pmovsxbd
4484IEMIMPL_V_PMOV_SZ_X pmovsxbq
4485IEMIMPL_V_PMOV_SZ_X pmovsxwd
4486IEMIMPL_V_PMOV_SZ_X pmovsxwq
4487IEMIMPL_V_PMOV_SZ_X pmovsxdq
4488
4489IEMIMPL_V_PMOV_SZ_X pmovzxbw
4490IEMIMPL_V_PMOV_SZ_X pmovzxbd
4491IEMIMPL_V_PMOV_SZ_X pmovzxbq
4492IEMIMPL_V_PMOV_SZ_X pmovzxwd
4493IEMIMPL_V_PMOV_SZ_X pmovzxwq
4494IEMIMPL_V_PMOV_SZ_X pmovzxdq
4495
4496
4497;;
4498; Need to move this somewhere better as well?
4499;
4500struc IEMSSERESULT
4501 .uResult resd 4
4502 .MXCSR resd 1
4503endstruc
4504
4505
4506;;
4507; Need to move this somewhere better as well?
4508;
4509struc IEMAVX128RESULT
4510 .uResult resd 4
4511 .MXCSR resd 1
4512endstruc
4513
4514
4515;;
4516; Need to move this somewhere better as well?
4517;
4518struc IEMAVX256RESULT
4519 .uResult resd 8
4520 .MXCSR resd 1
4521endstruc
4522
4523
4524;;
4525; Initialize the SSE MXCSR register partially from the guest value so
4526; that the rounding mode and the FZ/DAZ bits are taken into account.
4527;
4528; @uses 4 bytes of stack to save the original value, T0.
4529; @param 1 Expression giving the address of the FXSTATE of the guest.
4530;
4531%macro SSE_LD_FXSTATE_MXCSR 1
4532 sub xSP, 4
4533
4534 stmxcsr [xSP]
4535 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4536 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4537 or T0_32, X86_MXCSR_XCPT_MASK
4538 sub xSP, 4
4539 mov [xSP], T0_32
4540 ldmxcsr [xSP]
4541 add xSP, 4
4542%endmacro
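
; Worked example (illustrative only): with a guest MXCSR of 0x2040 (round
; down, DAZ set) the value loaded into the host MXCSR above is
;   (0x2040 & (X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ)) | X86_MXCSR_XCPT_MASK
;    = 0x2040 | 0x1f80 = 0x3fc0
; i.e. the guest rounding/FZ/DAZ control bits are honoured while all SIMD
; exceptions stay masked so the emulation itself cannot fault here.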
4543
4544
4545;;
4546; Restores the SSE MXCSR register with the original value.
4547;
4548; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4549; @param 1 Expression giving the address where to return the MXCSR value.
4550; @param 2 Expression giving the address of the FXSTATE of the guest.
4551;
4552; @note Restores the stack pointer.
4553;
4554%macro SSE_ST_FXSTATE_MXCSR 2
4555 sub xSP, 4
4556 stmxcsr [xSP]
4557 mov T0_32, [xSP]
4558 add xSP, 4
4559 ; Merge the status bits into the original MXCSR value.
4560 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4561 and T0_32, X86_MXCSR_XCPT_FLAGS
4562 or T0_32, T1_32
4563 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4564
4565 ldmxcsr [xSP]
4566 add xSP, 4
4567%endmacro
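
; Illustrative example of the merge above: if the guest MXCSR was 0x1f80
; and the emulated instruction raised the precision flag (PE, bit 5) in the
; host MXCSR, the value stored into IEMSSERESULT.MXCSR becomes
;   0x1f80 | (host MXCSR & X86_MXCSR_XCPT_FLAGS) = 0x1fa0
; i.e. only the six exception status bits are carried back to the guest.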
4568
4569
4570;;
4571; Initialize the MXCSR register partially from the guest value so
4572; that the rounding mode and the FZ/DAZ bits are taken into account.
4573;
4574; @uses 4 bytes of stack to save the original value.
4575; @param 1 Expression giving the address of the FXSTATE of the guest.
4576;
4577%macro AVX_LD_XSAVEAREA_MXCSR 1
4578 sub xSP, 4
4579
4580 stmxcsr [xSP]
4581 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4582 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4583 sub xSP, 4
4584 mov [xSP], T0_32
4585 ldmxcsr [xSP]
4586 add xSP, 4
4587%endmacro
4588
4589
4590;;
4591; Restores the AVX128 MXCSR register with the original value.
4592;
4593; @param 1 Expression giving the address where to return the MXCSR value.
4594;
4595; @note Restores the stack pointer.
4596;
4597%macro AVX128_ST_XSAVEAREA_MXCSR 1
4598 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4599
4600 ldmxcsr [xSP]
4601 add xSP, 4
4602%endmacro
4603
4604
4605;;
4606; Restores the AVX256 MXCSR register with the original value.
4607;
4608; @param 1 Expression giving the address where to return the MXCSR value.
4609;
4610; @note Restores the stack pointer.
4611;
4612%macro AVX256_ST_XSAVEAREA_MXCSR 1
4613 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4614
4615 ldmxcsr [xSP]
4616 add xSP, 4
4617%endmacro
4618
4619
4620;;
4621; Floating point instruction working on two full sized registers.
4622;
4623; @param 1 The instruction
4624; @param 2 Whether the AVX variant of the instruction takes two (2) or three (3) operands; pass 0 to disable the AVX variants.
4625;
4626; @param A0 FPU context (FXSTATE or XSAVEAREA).
4627; @param A1 Where to return the result including the MXCSR value.
4628; @param A2 Pointer to the first media register size operand (input/output).
4629; @param A3 Pointer to the second media register size operand (input).
4630;
4631%macro IEMIMPL_FP_F2 2
4632BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4633 PROLOGUE_4_ARGS
4634 IEMIMPL_SSE_PROLOGUE
4635 SSE_LD_FXSTATE_MXCSR A0
4636
4637 movdqu xmm0, [A2]
4638 movdqu xmm1, [A3]
4639 %1 xmm0, xmm1
4640 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4641
4642 SSE_ST_FXSTATE_MXCSR A1, A0
4643 IEMIMPL_SSE_EPILOGUE
4644 EPILOGUE_4_ARGS
4645ENDPROC iemAImpl_ %+ %1 %+ _u128
4646
4647 %if %2 == 3
4648BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4649 PROLOGUE_4_ARGS
4650 IEMIMPL_AVX_PROLOGUE
4651 AVX_LD_XSAVEAREA_MXCSR A0
4652
4653 vmovdqu xmm0, [A2]
4654 vmovdqu xmm1, [A3]
4655 v %+ %1 xmm0, xmm0, xmm1
4656 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4657
4658 AVX128_ST_XSAVEAREA_MXCSR A1
4659 IEMIMPL_AVX_EPILOGUE
4660 EPILOGUE_4_ARGS
4661ENDPROC iemAImpl_v %+ %1 %+ _u128
4662
4663BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4664 PROLOGUE_4_ARGS
4665 IEMIMPL_AVX_PROLOGUE
4666 AVX_LD_XSAVEAREA_MXCSR A0
4667
4668 vmovdqu ymm0, [A2]
4669 vmovdqu ymm1, [A3]
4670 v %+ %1 ymm0, ymm0, ymm1
4671 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4672
4673 AVX256_ST_XSAVEAREA_MXCSR A1
4674 IEMIMPL_AVX_EPILOGUE
4675 EPILOGUE_4_ARGS
4676ENDPROC iemAImpl_v %+ %1 %+ _u256
4677 %elif %2 == 2
4678BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4679 PROLOGUE_4_ARGS
4680 IEMIMPL_AVX_PROLOGUE
4681 AVX_LD_XSAVEAREA_MXCSR A0
4682
4683 vmovdqu xmm0, [A2]
4684 vmovdqu xmm1, [A3]
4685 v %+ %1 xmm0, xmm1
4686 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4687
4688 AVX128_ST_XSAVEAREA_MXCSR A1
4689 IEMIMPL_AVX_EPILOGUE
4690 EPILOGUE_4_ARGS
4691ENDPROC iemAImpl_v %+ %1 %+ _u128
4692
4693BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4694 PROLOGUE_4_ARGS
4695 IEMIMPL_AVX_PROLOGUE
4696 AVX_LD_XSAVEAREA_MXCSR A0
4697
4698 vmovdqu ymm0, [A2]
4699 vmovdqu ymm1, [A3]
4700 v %+ %1 ymm0, ymm1
4701 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4702
4703 AVX256_ST_XSAVEAREA_MXCSR A1
4704 IEMIMPL_AVX_EPILOGUE
4705 EPILOGUE_4_ARGS
4706ENDPROC iemAImpl_v %+ %1 %+ _u256
4707 %endif
4708%endmacro
4709
4710IEMIMPL_FP_F2 addps, 3
4711IEMIMPL_FP_F2 addpd, 3
4712IEMIMPL_FP_F2 mulps, 3
4713IEMIMPL_FP_F2 mulpd, 3
4714IEMIMPL_FP_F2 subps, 3
4715IEMIMPL_FP_F2 subpd, 3
4716IEMIMPL_FP_F2 minps, 3
4717IEMIMPL_FP_F2 minpd, 3
4718IEMIMPL_FP_F2 divps, 3
4719IEMIMPL_FP_F2 divpd, 3
4720IEMIMPL_FP_F2 maxps, 3
4721IEMIMPL_FP_F2 maxpd, 3
4722IEMIMPL_FP_F2 haddps, 3
4723IEMIMPL_FP_F2 haddpd, 3
4724IEMIMPL_FP_F2 hsubps, 3
4725IEMIMPL_FP_F2 hsubpd, 3
4726IEMIMPL_FP_F2 addsubps, 3
4727IEMIMPL_FP_F2 addsubpd, 3
4728
4729
4730;;
4731; These are actually unary operations, but to keep things simple we
4732; treat them as binary for now, so that the output result always
4733; stays in sync with the register the result would otherwise be
4734; written to.
4735IEMIMPL_FP_F2 sqrtps, 2
4736IEMIMPL_FP_F2 rsqrtps, 2
4737IEMIMPL_FP_F2 sqrtpd, 2
4738IEMIMPL_FP_F2 cvtdq2ps, 2
4739IEMIMPL_FP_F2 cvtps2dq, 2
4740IEMIMPL_FP_F2 cvttps2dq, 2
4741IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4742IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants missing right now due to register size differences
4743IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants missing right now due to register size differences
4744
4745
4746;;
4747; Floating point instruction working on a full sized register and a single precision operand.
4748;
4749; @param 1 The instruction
4750;
4751; @param A0 FPU context (FXSTATE or XSAVEAREA).
4752; @param A1 Where to return the result including the MXCSR value.
4753; @param A2 Pointer to the first media register size operand (input/output).
4754; @param A3 Pointer to the second single precision floating point value (input).
4755;
4756%macro IEMIMPL_FP_F2_R32 1
4757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4758 PROLOGUE_4_ARGS
4759 IEMIMPL_SSE_PROLOGUE
4760 SSE_LD_FXSTATE_MXCSR A0
4761
4762 movdqu xmm0, [A2]
4763 movd xmm1, [A3]
4764 %1 xmm0, xmm1
4765 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4766
4767 SSE_ST_FXSTATE_MXCSR A1, A0
4768 IEMIMPL_SSE_EPILOGUE
4769 EPILOGUE_4_ARGS
4770ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4771
4772BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4773 PROLOGUE_4_ARGS
4774 IEMIMPL_AVX_PROLOGUE
4775 AVX_LD_XSAVEAREA_MXCSR A0
4776
4777 vmovdqu xmm0, [A2]
4778 vmovd xmm1, [A3]
4779 v %+ %1 xmm0, xmm0, xmm1
4780 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4781
4782 AVX128_ST_XSAVEAREA_MXCSR A1
4783 IEMIMPL_AVX_EPILOGUE
4784 EPILOGUE_4_ARGS
4785ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4786%endmacro
4787
4788IEMIMPL_FP_F2_R32 addss
4789IEMIMPL_FP_F2_R32 mulss
4790IEMIMPL_FP_F2_R32 subss
4791IEMIMPL_FP_F2_R32 minss
4792IEMIMPL_FP_F2_R32 divss
4793IEMIMPL_FP_F2_R32 maxss
4794IEMIMPL_FP_F2_R32 cvtss2sd
4795IEMIMPL_FP_F2_R32 sqrtss
4796IEMIMPL_FP_F2_R32 rsqrtss
4797
4798
4799;;
4800; Floating point instruction working on a full sized register and a double precision operand.
4801;
4802; @param 1 The instruction
4803;
4804; @param A0 FPU context (FXSTATE or XSAVEAREA).
4805; @param A1 Where to return the result including the MXCSR value.
4806; @param A2 Pointer to the first media register size operand (input/output).
4807; @param A3 Pointer to the second double precision floating point value (input).
4808;
4809%macro IEMIMPL_FP_F2_R64 1
4810BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4811 PROLOGUE_4_ARGS
4812 IEMIMPL_SSE_PROLOGUE
4813 SSE_LD_FXSTATE_MXCSR A0
4814
4815 movdqu xmm0, [A2]
4816 movq xmm1, [A3]
4817 %1 xmm0, xmm1
4818 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4819
4820 SSE_ST_FXSTATE_MXCSR A1, A0
4821 IEMIMPL_SSE_EPILOGUE
4822 EPILOGUE_4_ARGS
4823ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4824
4825BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4826 PROLOGUE_4_ARGS
4827 IEMIMPL_AVX_PROLOGUE
4828 AVX_LD_XSAVEAREA_MXCSR A0
4829
4830 vmovdqu xmm0, [A2]
4831 vmovq xmm1, [A3]
4832 v %+ %1 xmm0, xmm0, xmm1
4833 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4834
4835 AVX128_ST_XSAVEAREA_MXCSR A1
4836 IEMIMPL_AVX_EPILOGUE
4837 EPILOGUE_4_ARGS
4838ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4839%endmacro
4840
4841IEMIMPL_FP_F2_R64 addsd
4842IEMIMPL_FP_F2_R64 mulsd
4843IEMIMPL_FP_F2_R64 subsd
4844IEMIMPL_FP_F2_R64 minsd
4845IEMIMPL_FP_F2_R64 divsd
4846IEMIMPL_FP_F2_R64 maxsd
4847IEMIMPL_FP_F2_R64 cvtsd2ss
4848IEMIMPL_FP_F2_R64 sqrtsd
4849
4850
4851;;
4852; Macro for the cvtpd2ps/cvtps2pd instructions.
4853;
4854; @param 1 The instruction name.
4855; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4856;
4857; @param A0 FPU context (FXSTATE or XSAVEAREA).
4858; @param A1 Where to return the result including the MXCSR value.
4859; @param A2 Pointer to the first media register size operand (input/output).
4860; @param A3 Pointer to the second media register size operand (input).
4861;
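; For reference: with %2 = 0 (cvtpd2ps) the 256-bit wrapper narrows, i.e.
; vcvtpd2ps xmm0, ymm1 turns four doubles into four singles in a 128-bit
; result, while with %2 = 1 (cvtps2pd) it widens, vcvtps2pd ymm0, xmm1
; turning four singles into four doubles filling the whole YMM register.
;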
4862%macro IEMIMPL_CVT_F2 2
4863BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4864 PROLOGUE_4_ARGS
4865 IEMIMPL_SSE_PROLOGUE
4866 SSE_LD_FXSTATE_MXCSR A0
4867
4868 movdqu xmm0, [A2]
4869 movdqu xmm1, [A3]
4870 %1 xmm0, xmm1
4871 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4872
4873 SSE_ST_FXSTATE_MXCSR A1, A0
4874 IEMIMPL_SSE_EPILOGUE
4875 EPILOGUE_4_ARGS
4876ENDPROC iemAImpl_ %+ %1 %+ _u128
4877
4878BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4879 PROLOGUE_4_ARGS
4880 IEMIMPL_AVX_PROLOGUE
4881 AVX_LD_XSAVEAREA_MXCSR A0
4882
4883 vmovdqu xmm0, [A2]
4884 vmovdqu xmm1, [A3]
4885 v %+ %1 xmm0, xmm1
4886 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4887
4888 AVX128_ST_XSAVEAREA_MXCSR A1
4889 IEMIMPL_AVX_EPILOGUE
4890 EPILOGUE_4_ARGS
4891ENDPROC iemAImpl_v %+ %1 %+ _u128
4892
4893BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
4894 PROLOGUE_4_ARGS
4895 IEMIMPL_AVX_PROLOGUE
4896 AVX_LD_XSAVEAREA_MXCSR A0
4897
4898 vmovdqu ymm0, [A2]
4899 vmovdqu ymm1, [A3]
4900 %if %2 == 0
4901 v %+ %1 xmm0, ymm1
4902 %else
4903 v %+ %1 ymm0, xmm1
4904 %endif
4905 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4906
4907 AVX256_ST_XSAVEAREA_MXCSR A1
4908 IEMIMPL_AVX_EPILOGUE
4909 EPILOGUE_4_ARGS
4910ENDPROC iemAImpl_v %+ %1 %+ _u256
4911%endmacro
4912
4913IEMIMPL_CVT_F2 cvtpd2ps, 0
4914IEMIMPL_CVT_F2 cvtps2pd, 1
4915
4916
4917;;
4918; shufps instructions with 8-bit immediates.
4919;
4920; @param A0 Pointer to the destination media register size operand (input/output).
4921; @param A1 Pointer to the first source media register size operand (input).
4922; @param A2 The 8-bit immediate
4923;
4924BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
4925 PROLOGUE_3_ARGS
4926 IEMIMPL_SSE_PROLOGUE
4927
4928 movdqu xmm0, [A0]
4929 movdqu xmm1, [A1]
4930 lea T1, [.imm0 xWrtRIP]
4931 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4932 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
4933 %else
4934 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
4935 %endif
4936 lea T1, [T1 + T0*2]
4937 IBT_NOTRACK
4938 call T1
4939 movdqu [A0], xmm0
4940
4941 IEMIMPL_SSE_EPILOGUE
4942 EPILOGUE_3_ARGS
4943 %assign bImm 0
4944 %rep 256
4945.imm %+ bImm:
4946 IBT_ENDBRxx_WITHOUT_NOTRACK
4947 shufps xmm0, xmm1, bImm
4948 ret
4949 int3
4950 %assign bImm bImm + 1
4951 %endrep
4952.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4953ENDPROC iemAImpl_shufps_u128
4954
4955
4956;;
4957; shufpd instruction with 8-bit immediates.
4958;
4959; @param A0 Pointer to the destination media register size operand (input/output).
4960; @param A1 Pointer to the first source media register size operand (input).
4961; @param A2 The 8-bit immediate
4962;
4963BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
4964 PROLOGUE_3_ARGS
4965 IEMIMPL_SSE_PROLOGUE
4966
4967 movdqu xmm0, [A0]
4968 movdqu xmm1, [A1]
4969 lea T1, [.imm0 xWrtRIP]
4970 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4971 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4972 %else
4973 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4974 %endif
4975 lea T1, [T1 + T0*2]
4976 IBT_NOTRACK
4977 call T1
4978 movdqu [A0], xmm0
4979
4980 IEMIMPL_SSE_EPILOGUE
4981 EPILOGUE_3_ARGS
4982 %assign bImm 0
4983 %rep 256
4984.imm %+ bImm:
4985 IBT_ENDBRxx_WITHOUT_NOTRACK
4986 shufpd xmm0, xmm1, bImm
4987 ret
4988 %assign bImm bImm + 1
4989 %endrep
4990.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4991ENDPROC iemAImpl_shufpd_u128
4992
4993
4994;;
4995; vshufp{s,d} instructions with 8-bit immediates.
4996;
4997; @param 1 The instruction name.
4998;
4999; @param A0 Pointer to the destination media register size operand (output).
5000; @param A1 Pointer to the first source media register size operand (input).
5001; @param A2 Pointer to the second source media register size operand (input).
5002; @param A3 The 8-bit immediate
5003;
5004%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5006 PROLOGUE_4_ARGS
5007 IEMIMPL_AVX_PROLOGUE
5008
5009 movdqu xmm0, [A1]
5010 movdqu xmm1, [A2]
5011 lea T1, [.imm0 xWrtRIP]
5012 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5013 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5014 %else
5015 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5016 %endif
5017 lea T1, [T1 + T0*2]
5018 IBT_NOTRACK
5019 call T1
5020 movdqu [A0], xmm0
5021
5022 IEMIMPL_AVX_EPILOGUE
5023 EPILOGUE_4_ARGS
5024 %assign bImm 0
5025 %rep 256
5026.imm %+ bImm:
5027 IBT_ENDBRxx_WITHOUT_NOTRACK
5028 %1 xmm0, xmm0, xmm1, bImm
5029 ret
5030 %assign bImm bImm + 1
5031 %endrep
5032.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5033ENDPROC iemAImpl_ %+ %1 %+ _u128
5034
5035BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5036 PROLOGUE_4_ARGS
5037 IEMIMPL_AVX_PROLOGUE
5038
5039 vmovdqu ymm0, [A1]
5040 vmovdqu ymm1, [A2]
5041 lea T1, [.imm0 xWrtRIP]
5042 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5043 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5044 %else
5045 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5046 %endif
5047 lea T1, [T1 + T0*2]
5048 IBT_NOTRACK
5049 call T1
5050 vmovdqu [A0], ymm0
5051
5052 IEMIMPL_AVX_EPILOGUE
5053 EPILOGUE_4_ARGS
5054 %assign bImm 0
5055 %rep 256
5056.imm %+ bImm:
5057 IBT_ENDBRxx_WITHOUT_NOTRACK
5058 %1 ymm0, ymm0, ymm1, bImm
5059 ret
5060 %assign bImm bImm + 1
5061 %endrep
5062.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5063ENDPROC iemAImpl_ %+ %1 %+ _u256
5064%endmacro
5065
5066IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5067IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5068
5069
5070;;
5071; One of the [p]blendv{b,ps,pd} variants
5072;
5073; @param 1 The instruction
5074;
5075; @param A0 Pointer to the first media register sized operand (input/output).
5076; @param A1 Pointer to the second media sized value (input).
5077; @param A2 Pointer to the media register sized mask value (input).
5078;
5079%macro IEMIMPL_P_BLEND 1
5080BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5081 PROLOGUE_3_ARGS
5082 IEMIMPL_SSE_PROLOGUE
5083
5084 movdqu xmm0, [A2] ; This is implicit
5085 movdqu xmm1, [A0]
5086 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5087 %1 xmm1, xmm2
5088 movdqu [A0], xmm1
5089
5090 IEMIMPL_SSE_EPILOGUE
5091 EPILOGUE_3_ARGS
5092ENDPROC iemAImpl_ %+ %1 %+ _u128
5093%endmacro
5094
5095IEMIMPL_P_BLEND pblendvb
5096IEMIMPL_P_BLEND blendvps
5097IEMIMPL_P_BLEND blendvpd
5098
5099
5100;;
5101; One of the v[p]blendv{b,ps,pd} variants
5102;
5103; @param 1 The instruction
5104;
5105; @param A0 Pointer to the first media register sized operand (output).
5106; @param A1 Pointer to the first media register sized operand (input).
5107; @param A2 Pointer to the second media register sized operand (input).
5108; @param A3 Pointer to the media register sized mask value (input).
5109%macro IEMIMPL_AVX_P_BLEND 1
5110BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5111 PROLOGUE_4_ARGS
5112 IEMIMPL_AVX_PROLOGUE
5113
5114 vmovdqu xmm0, [A1]
5115 vmovdqu xmm1, [A2]
5116 vmovdqu xmm2, [A3]
5117 %1 xmm0, xmm0, xmm1, xmm2
5118 vmovdqu [A0], xmm0
5119
5120 IEMIMPL_AVX_EPILOGUE
5121 EPILOGUE_4_ARGS
5122ENDPROC iemAImpl_ %+ %1 %+ _u128
5123
5124BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5125 PROLOGUE_4_ARGS
5126 IEMIMPL_AVX_PROLOGUE
5127
5128 vmovdqu ymm0, [A1]
5129 vmovdqu ymm1, [A2]
5130 vmovdqu ymm2, [A3]
5131 %1 ymm0, ymm0, ymm1, ymm2
5132 vmovdqu [A0], ymm0
5133
5134 IEMIMPL_AVX_EPILOGUE
5135 EPILOGUE_4_ARGS
5136ENDPROC iemAImpl_ %+ %1 %+ _u256
5137%endmacro
5138
5139IEMIMPL_AVX_P_BLEND vpblendvb
5140IEMIMPL_AVX_P_BLEND vblendvps
5141IEMIMPL_AVX_P_BLEND vblendvpd
5142
5143
5144;;
5145; palignr mm1, mm2/m64 instruction.
5146;
5147; @param A0 Pointer to the first media register sized operand (output).
5148; @param A1 The second register sized operand (input).
5149; @param A2 The 8-bit immediate.
5150BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5151 PROLOGUE_3_ARGS
5152 IEMIMPL_MMX_PROLOGUE
5153
5154 movq mm0, [A0]
5155 movq mm1, A1
5156 lea T1, [.imm0 xWrtRIP]
5157 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5158 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5159 %else
5160 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5161 %endif
5162 lea T1, [T1 + T0*2]
5163 IBT_NOTRACK
5164 call T1
5165 movq [A0], mm0
5166
5167 IEMIMPL_MMX_EPILOGUE
5168 EPILOGUE_3_ARGS
5169 %assign bImm 0
5170 %rep 256
5171.imm %+ bImm:
5172 IBT_ENDBRxx_WITHOUT_NOTRACK
5173 palignr mm0, mm1, bImm
5174 ret
5175 %assign bImm bImm + 1
5176 %endrep
5177.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5178ENDPROC iemAImpl_palignr_u64
5179
5180
5181;;
5182; SSE instructions with 8-bit immediates of the form
5183; xxx xmm1, xmm2, imm8.
5184; where the instruction encoding takes up 6 bytes.
5185;
5186; @param 1 The instruction name.
5187;
5188; @param A0 Pointer to the first media register size operand (input/output).
5189; @param A1 Pointer to the second source media register size operand (input).
5190; @param A2 The 8-bit immediate
5191;
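; Dispatch note (stub sizes as stated in the comments below, not
; re-measured here): each per-immediate stub is insn (6) + ret (1) +
; int3 (1) = 8 bytes; the int3 pads the stub precisely so the offset is a
; plain imm8 * 8 (a single 'lea T1, [T1 + A2*8]').  With IBT the 4 byte
; endbr makes it imm8 * 12, computed as (A2 * 3) * 4.
;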
5192%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5193BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5194 PROLOGUE_3_ARGS
5195 IEMIMPL_SSE_PROLOGUE
5196
5197 movdqu xmm0, [A0]
5198 movdqu xmm1, [A1]
5199 lea T1, [.imm0 xWrtRIP]
5200 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5201 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5202 lea T1, [T1 + T0*4]
5203 %else
5204 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5205 %endif
5206 IBT_NOTRACK
5207 call T1
5208 movdqu [A0], xmm0
5209
5210 IEMIMPL_SSE_EPILOGUE
5211 EPILOGUE_3_ARGS
5212 %assign bImm 0
5213 %rep 256
5214.imm %+ bImm:
5215 IBT_ENDBRxx_WITHOUT_NOTRACK
5216 %1 xmm0, xmm1, bImm
5217 ret
5218 int3
5219 %assign bImm bImm + 1
5220 %endrep
5221.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5222ENDPROC iemAImpl_ %+ %1 %+ _u128
5223%endmacro
5224
5225IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5226IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5227IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5228IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5229IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5230IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5231IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5232
5233
5234;;
5235; AVX instructions with 8-bit immediates of the form
5236; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5237; where the instruction encoding takes up 6 bytes.
5238;
5239; @param 1 The instruction name.
5240; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
5241;
5242; @param A0 Pointer to the destination media register size operand (output).
5243; @param A1 Pointer to the first source media register size operand (input).
5244; @param A2 Pointer to the second source media register size operand (input).
5245; @param A3 The 8-bit immediate
5246;
5247%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
5248BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5249 PROLOGUE_4_ARGS
5250 IEMIMPL_AVX_PROLOGUE
5251
5252 movdqu xmm0, [A1]
5253 movdqu xmm1, [A2]
5254 lea T1, [.imm0 xWrtRIP]
5255 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5256 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5257 lea T1, [T1 + T0*4]
5258 %else
5259 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5260 %endif
5261 IBT_NOTRACK
5262 call T1
5263 movdqu [A0], xmm0
5264
5265 IEMIMPL_AVX_EPILOGUE
5266 EPILOGUE_4_ARGS
5267 %assign bImm 0
5268 %rep 256
5269.imm %+ bImm:
5270 IBT_ENDBRxx_WITHOUT_NOTRACK
5271 %1 xmm0, xmm0, xmm1, bImm
5272 ret
5273 int3
5274 %assign bImm bImm + 1
5275 %endrep
5276.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5277ENDPROC iemAImpl_ %+ %1 %+ _u128
5278
5279 %if %2 == 1
5280BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5281 PROLOGUE_4_ARGS
5282 IEMIMPL_AVX_PROLOGUE
5283
5284 vmovdqu ymm0, [A1]
5285 vmovdqu ymm1, [A2]
5286 lea T1, [.imm0 xWrtRIP]
5287 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5288 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5289 lea T1, [T1 + T0*4]
5290 %else
5291 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5292 %endif
5293 IBT_NOTRACK
5294 call T1
5295 vmovdqu [A0], ymm0
5296
5297 IEMIMPL_AVX_EPILOGUE
5298 EPILOGUE_4_ARGS
5299 %assign bImm 0
5300 %rep 256
5301.imm %+ bImm:
5302 IBT_ENDBRxx_WITHOUT_NOTRACK
5303 %1 ymm0, ymm0, ymm1, bImm
5304 ret
5305 int3
5306 %assign bImm bImm + 1
5307 %endrep
5308.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5309ENDPROC iemAImpl_ %+ %1 %+ _u256
5310 %endif
5311%endmacro
5312
5313IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
5314IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
5315IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
5316IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
5317IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5318
5319
5320;;
5321; Need to move this somewhere better as well?
5322;
5323struc IEMPCMPISTRXSRC
5324 .uSrc1 resd 4
5325 .uSrc2 resd 4
5326endstruc
5327
5328struc IEMPCMPESTRXSRC
5329 .uSrc1 resd 4
5330 .uSrc2 resd 4
5331 .u64Rax resd 2
5332 .u64Rdx resd 2
5333endstruc
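
; Note: .u64Rax/.u64Rdx hold the guest RAX/RDX length operands that
; pcmpestr{i,m} take implicitly; the wrappers below load them into xAX/xDX
; right before dispatching into the per-immediate stubs.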
5334
5335;;
5336; The pcmpistri instruction.
5337;
5338; @param A0 Pointer to the ECX register to store the result to (output).
5339; @param A1 Pointer to the EFLAGS register.
5340; @param A2 Pointer to the structure containing the source operands (input).
5341; @param A3 The 8-bit immediate
5342;
5343BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5344 PROLOGUE_4_ARGS
5345 IEMIMPL_SSE_PROLOGUE
5346
5347 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5348 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5349 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions and would get overwritten later (T2 is only available on AMD64)
5350 lea T1, [.imm0 xWrtRIP]
5351 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5352 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5353 lea T1, [T1 + T0*4]
5354 %else
5355 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5356 %endif
5357 IBT_NOTRACK
5358 call T1
5359
5360 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5361 mov [T2], ecx
5362
5363 IEMIMPL_SSE_EPILOGUE
5364 EPILOGUE_4_ARGS
5365 %assign bImm 0
5366 %rep 256
5367.imm %+ bImm:
5368 IBT_ENDBRxx_WITHOUT_NOTRACK
5369 pcmpistri xmm0, xmm1, bImm
5370 ret
5371 int3
5372 %assign bImm bImm + 1
5373 %endrep
5374.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5375ENDPROC iemAImpl_pcmpistri_u128
5376
5377;;
5378; The pcmpestri instruction.
5379;
5380; @param A0 Pointer to the ECX register to store the result to (output).
5381; @param A1 Pointer to the EFLAGS register.
5382; @param A2 Pointer to the structure containing the source operands (input).
5383; @param A3 The 8-bit immediate
5384;
5385BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5386 PROLOGUE_4_ARGS
5387 IEMIMPL_SSE_PROLOGUE
5388
5389 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5390 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5391 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions and would get overwritten later (T2 is only available on AMD64)
5392 lea T1, [.imm0 xWrtRIP]
5393 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5394 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5395 lea T1, [T1 + T0*4]
5396 %else
5397 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5398 %endif
5399 push xDX ; xDX can be A1 or A2 depending on the calling convention
5400 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5401 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5402 IBT_NOTRACK
5403 call T1
5404
5405 pop xDX
5406 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5407 mov [T2], ecx
5408
5409 IEMIMPL_SSE_EPILOGUE
5410 EPILOGUE_4_ARGS
5411 %assign bImm 0
5412 %rep 256
5413.imm %+ bImm:
5414 IBT_ENDBRxx_WITHOUT_NOTRACK
5415 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5416 pcmpestri xmm0, xmm1, bImm
5417 ret
5418 %assign bImm bImm + 1
5419 %endrep
5420.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5421ENDPROC iemAImpl_pcmpestri_u128
5422
5423;;
5424; The pcmpistrm instruction template.
5425;
5426; @param A0 Pointer to the XMM0 register to store the result to (output).
5427; @param A1 Pointer to the EFLAGS register.
5428; @param A2 Pointer to the structure containing the source operands (input).
5429; @param A3 The 8-bit immediate
5430;
5431BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5432 PROLOGUE_4_ARGS
5433 IEMIMPL_SSE_PROLOGUE
5434
5435 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5436 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5437 lea T1, [.imm0 xWrtRIP]
5438 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5439 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5440 lea T1, [T1 + T0*4]
5441 %else
5442 lea T1, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8
5443 %endif
5444 IBT_NOTRACK
5445 call T1
5446
5447 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5448 movdqu [A0], xmm0
5449
5450 IEMIMPL_SSE_EPILOGUE
5451 EPILOGUE_4_ARGS
5452 %assign bImm 0
5453 %rep 256
5454.imm %+ bImm:
5455 IBT_ENDBRxx_WITHOUT_NOTRACK
5456 pcmpistrm xmm1, xmm2, bImm
5457 ret
5458 int3
5459 %assign bImm bImm + 1
5460 %endrep
5461.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5462ENDPROC iemAImpl_pcmpistrm_u128
5463
5464;;
5465; The pcmpestrm instruction template.
5466;
5467; @param A0 Pointer to the XMM0 register to store the result to (output).
5468; @param A1 Pointer to the EFLAGS register.
5469; @param A2 Pointer to the structure containing the source operands (input).
5470; @param A3 The 8-bit immediate
5471;
5472BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5473 PROLOGUE_4_ARGS
5474 IEMIMPL_SSE_PROLOGUE
5475
5476 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5477 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5478 lea T1, [.imm0 xWrtRIP]
5479 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5480 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5481 lea T1, [T1 + T0*4]
5482 %else
5483 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5484 %endif
5485 push xDX ; xDX can be A1 or A2 depending on the calling convention
5486 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5487 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5488 IBT_NOTRACK
5489 call T1
5490
5491 pop xDX
5492 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5493 movdqu [A0], xmm0
5494
5495 IEMIMPL_SSE_EPILOGUE
5496 EPILOGUE_4_ARGS
5497 %assign bImm 0
5498 %rep 256
5499.imm %+ bImm:
5500 IBT_ENDBRxx_WITHOUT_NOTRACK
5501 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5502 pcmpestrm xmm1, xmm2, bImm
5503 ret
5504 %assign bImm bImm + 1
5505 %endrep
5506.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5507ENDPROC iemAImpl_pcmpestrm_u128
5508
5509
5510;;
5511; pinsrw instruction.
5512;
5513; @param A0 Pointer to the first media register size operand (input/output).
5514; @param A1 The 16 bit input operand (input).
5515; @param A2 The 8-bit immediate
5516;
5517BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5518 PROLOGUE_3_ARGS
5519 IEMIMPL_SSE_PROLOGUE
5520
5521 movq mm0, [A0]
5522 lea T1, [.imm0 xWrtRIP]
5523 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5524 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5525 %else
5526 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5527 %endif
5528 lea T1, [T1 + T0]
5529 IBT_NOTRACK
5530 call T1
5531 movq [A0], mm0
5532
5533 IEMIMPL_SSE_EPILOGUE
5534 EPILOGUE_3_ARGS
5535 %assign bImm 0
5536 %rep 256
5537.imm %+ bImm:
5538 IBT_ENDBRxx_WITHOUT_NOTRACK
5539 pinsrw mm0, A1_32, bImm
5540 ret
5541 %assign bImm bImm + 1
5542 %endrep
5543.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5544ENDPROC iemAImpl_pinsrw_u64
5545
5546BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
5547 PROLOGUE_3_ARGS
5548 IEMIMPL_SSE_PROLOGUE
5549
5550 movdqu xmm0, [A0]
5551 lea T1, [.imm0 xWrtRIP]
5552 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5553 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5554 %else
5555 lea T0, [A2 + A2*2] ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5556 %endif
5557 lea T1, [T1 + T0*2]
5558 IBT_NOTRACK
5559 call T1
5560 movdqu [A0], xmm0
5561
5562 IEMIMPL_SSE_EPILOGUE
5563 EPILOGUE_3_ARGS
5564 %assign bImm 0
5565 %rep 256
5566.imm %+ bImm:
5567 IBT_ENDBRxx_WITHOUT_NOTRACK
5568 pinsrw xmm0, A1_32, bImm
5569 ret
5570 %assign bImm bImm + 1
5571 %endrep
5572.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5573ENDPROC iemAImpl_pinsrw_u128
5574
5575;;
5576; vpinsrw instruction.
5577;
5578; @param A0 Pointer to the first media register size operand (output).
5579; @param A1 Pointer to the source media register size operand (input).
5580; @param A2 The 16 bit input operand (input).
5581; @param A3 The 8-bit immediate
5582;
5583BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5584 PROLOGUE_4_ARGS
5585 IEMIMPL_SSE_PROLOGUE
5586
5587 movdqu xmm0, [A1]
5588 lea T1, [.imm0 xWrtRIP]
5589 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5590 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5591 %else
5592 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5593 %endif
5594 lea T1, [T1 + T0*2]
5595 mov A1, A2 ; A2 requires longer encoding on Windows
5596 IBT_NOTRACK
5597 call T1
5598 movdqu [A0], xmm0
5599
5600 IEMIMPL_SSE_EPILOGUE
5601 EPILOGUE_4_ARGS
5602 %assign bImm 0
5603 %rep 256
5604.imm %+ bImm:
5605 IBT_ENDBRxx_WITHOUT_NOTRACK
5606 vpinsrw xmm0, xmm0, A1_32, bImm
5607 ret
5608 %assign bImm bImm + 1
5609 %endrep
5610.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5611ENDPROC iemAImpl_vpinsrw_u128
5612
5613
5614;;
5615; pextrw instruction.
5616;
5617; @param A0 Pointer to the 16bit output operand (output).
5618; @param A1 Pointer to the media register size operand (input).
5619; @param A2 The 8-bit immediate
5620;
5621BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5622 PROLOGUE_3_ARGS
5623 IEMIMPL_SSE_PROLOGUE
5624
5625 movq mm0, A1
5626 lea T1, [.imm0 xWrtRIP]
5627 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5628 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5629 %else
5630 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5631 %endif
5632 lea T1, [T1 + T0]
5633 IBT_NOTRACK
5634 call T1
5635 mov word [A0], T0_16
5636
5637 IEMIMPL_SSE_EPILOGUE
5638 EPILOGUE_3_ARGS
5639 %assign bImm 0
5640 %rep 256
5641.imm %+ bImm:
5642 IBT_ENDBRxx_WITHOUT_NOTRACK
5643 pextrw T0_32, mm0, bImm
5644 ret
5645 %assign bImm bImm + 1
5646 %endrep
5647.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5648ENDPROC iemAImpl_pextrw_u64
5649
5650BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
5651 PROLOGUE_3_ARGS
5652 IEMIMPL_SSE_PROLOGUE
5653
5654 movdqu xmm0, [A1]
5655 lea T1, [.imm0 xWrtRIP]
5656 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5657 lea T0, [A2 + A2*4] ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5658 %else
5659 lea T0, [A2 + A2*2] ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5660 %endif
5661 lea T1, [T1 + T0*2]
5662 IBT_NOTRACK
5663 call T1
5664 mov word [A0], T0_16
5665
5666 IEMIMPL_SSE_EPILOGUE
5667 EPILOGUE_3_ARGS
5668 %assign bImm 0
5669 %rep 256
5670.imm %+ bImm:
5671 IBT_ENDBRxx_WITHOUT_NOTRACK
5672 pextrw T0_32, xmm0, bImm
5673 ret
5674 %assign bImm bImm + 1
5675 %endrep
5676.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5677ENDPROC iemAImpl_pextrw_u128
5678
5679;;
5680; vpextrw instruction.
5681;
5682; @param A0 Pointer to the 16bit output operand (output).
5683; @param A1 Pointer to the source media register size operand (input).
5684; @param A2 The 8-bit immediate
5685;
5686BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5687 PROLOGUE_3_ARGS
5688 IEMIMPL_SSE_PROLOGUE
5689
5690 movdqu xmm0, [A1]
5691 lea T1, [.imm0 xWrtRIP]
5692 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5693 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5694 %else
5695 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5696 %endif
5697 lea T1, [T1 + T0*2]
5698 IBT_NOTRACK
5699 call T1
5700 mov word [A0], T0_16
5701
5702 IEMIMPL_SSE_EPILOGUE
5703 EPILOGUE_3_ARGS
5704 %assign bImm 0
5705 %rep 256
5706.imm %+ bImm:
5707 IBT_ENDBRxx_WITHOUT_NOTRACK
5708 vpextrw T0_32, xmm0, bImm
5709 ret
5710 %assign bImm bImm + 1
5711 %endrep
5712.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5713ENDPROC iemAImpl_vpextrw_u128
5714
5715
5716;;
5717; movmskp{s,d} SSE instruction template
5718;
5719; @param 1 The SSE instruction name.
5720; @param 2 The AVX instruction name.
5721;
5722; @param A0 Pointer to the output register (output/byte sized).
5723; @param A1 Pointer to the source media register size operand (input).
5724;
5725%macro IEMIMPL_MEDIA_MOVMSK_P 2
5726BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5727 PROLOGUE_2_ARGS
5728 IEMIMPL_SSE_PROLOGUE
5729
5730 movdqu xmm0, [A1]
5731 %1 T0, xmm0
5732 mov byte [A0], T0_8
5733
5734 IEMIMPL_SSE_EPILOGUE
5735 EPILOGUE_2_ARGS
5736ENDPROC iemAImpl_ %+ %1 %+ _u128
5737
5738BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
5739 PROLOGUE_2_ARGS
5740 IEMIMPL_AVX_PROLOGUE
5741
5742 movdqu xmm0, [A1]
5743 %2 T0, xmm0
5744 mov byte [A0], T0_8
5745
5746 IEMIMPL_AVX_EPILOGUE
5747 EPILOGUE_2_ARGS
5748ENDPROC iemAImpl_ %+ %2 %+ _u128
5749
5750BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
5751 PROLOGUE_2_ARGS
5752 IEMIMPL_AVX_PROLOGUE
5753
5754 vmovdqu ymm0, [A1]
5755 %2 T0, ymm0
5756 mov byte [A0], T0_8
5757
5758 IEMIMPL_AVX_EPILOGUE
5759 EPILOGUE_2_ARGS
5760ENDPROC iemAImpl_ %+ %2 %+ _u256
5761%endmacro
5762
5763IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
5764IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5765
5766
5767;;
5768; Restores the SSE MXCSR register with the original value.
5769;
5770; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
5771; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5772; @param 2 Expression giving the address of the FXSTATE of the guest.
5773;
5774; @note Restores the stack pointer.
5775;
5776%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
5777 sub xSP, 4
5778 stmxcsr [xSP]
5779 mov T0_32, [xSP]
5780 add xSP, 4
5781 ; Merge the status bits into the original MXCSR value.
5782 mov T1_32, [%2 + X86FXSTATE.MXCSR]
5783 and T0_32, X86_MXCSR_XCPT_FLAGS
5784 or T0_32, T1_32
5785 mov [%1], T0_32
5786
5787 ldmxcsr [xSP]
5788 add xSP, 4
5789%endmacro
5790
5791
5792;;
5793; cvttsd2si instruction - 32-bit variant.
5794;
5795; @param A0 FPU context (FXSTATE or XSAVEAREA).
5796; @param A1 Where to return the MXCSR value.
5797; @param A2 Pointer to the result operand (output).
5798; @param A3 Pointer to the second operand (input).
5799;
5800BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
5801 PROLOGUE_4_ARGS
5802 IEMIMPL_SSE_PROLOGUE
5803 SSE_LD_FXSTATE_MXCSR A0
5804
5805 cvttsd2si T0_32, [A3]
5806 mov dword [A2], T0_32
5807
5808 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5809 IEMIMPL_SSE_EPILOGUE
5810 EPILOGUE_4_ARGS
5811ENDPROC iemAImpl_cvttsd2si_i32_r64
5812
5813;;
5814; cvttsd2si instruction - 64-bit variant.
5815;
5816; @param A0 FPU context (FXSTATE or XSAVEAREA).
5817; @param A1 Where to return the MXCSR value.
5818; @param A2 Pointer to the result operand (output).
5819; @param A3 Pointer to the second operand (input).
5820;
5821BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
5822 PROLOGUE_4_ARGS
5823 IEMIMPL_SSE_PROLOGUE
5824 SSE_LD_FXSTATE_MXCSR A0
5825
5826 cvttsd2si T0, [A3]
5827 mov qword [A2], T0
5828
5829 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5830 IEMIMPL_SSE_EPILOGUE
5831 EPILOGUE_4_ARGS
5832ENDPROC iemAImpl_cvttsd2si_i64_r64
5833
5834
5835;;
5836; cvtsd2si instruction - 32-bit variant.
5837;
5838; @param A0 FPU context (FXSTATE or XSAVEAREA).
5839; @param A1 Where to return the MXCSR value.
5840; @param A2 Pointer to the result operand (output).
5841; @param A3 Pointer to the second operand (input).
5842;
5843BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
5844 PROLOGUE_4_ARGS
5845 IEMIMPL_SSE_PROLOGUE
5846 SSE_LD_FXSTATE_MXCSR A0
5847
5848 cvtsd2si T0_32, [A3]
5849 mov dword [A2], T0_32
5850
5851 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5852 IEMIMPL_SSE_EPILOGUE
5853 EPILOGUE_4_ARGS
5854ENDPROC iemAImpl_cvtsd2si_i32_r64
5855
5856;;
5857; cvtsd2si instruction - 64-bit variant.
5858;
5859; @param A0 FPU context (FXSTATE or XSAVEAREA).
5860; @param A1 Where to return the MXCSR value.
5861; @param A2 Pointer to the result operand (output).
5862; @param A3 Pointer to the second operand (input).
5863;
5864BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
5865 PROLOGUE_4_ARGS
5866 IEMIMPL_SSE_PROLOGUE
5867 SSE_LD_FXSTATE_MXCSR A0
5868
5869 cvtsd2si T0, [A3]
5870 mov qword [A2], T0
5871
5872 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5873 IEMIMPL_SSE_EPILOGUE
5874 EPILOGUE_4_ARGS
5875ENDPROC iemAImpl_cvtsd2si_i64_r64
5876
5877
5878;;
5879; cvttss2si instruction - 32-bit variant.
5880;
5881; @param A0 FPU context (FXSTATE or XSAVEAREA).
5882; @param A1 Where to return the MXCSR value.
5883; @param A2 Pointer to the result operand (output).
5884; @param A3 Pointer to the second operand (input).
5885;
5886BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
5887 PROLOGUE_4_ARGS
5888 IEMIMPL_SSE_PROLOGUE
5889 SSE_LD_FXSTATE_MXCSR A0
5890
5891 cvttss2si T0_32, [A3]
5892 mov dword [A2], T0_32
5893
5894 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5895 IEMIMPL_SSE_EPILOGUE
5896 EPILOGUE_4_ARGS
5897ENDPROC iemAImpl_cvttss2si_i32_r32
5898
5899;;
5900; cvttss2si instruction - 64-bit variant.
5901;
5902; @param A0 FPU context (FXSTATE or XSAVEAREA).
5903; @param A1 Where to return the MXCSR value.
5904; @param A2 Pointer to the result operand (output).
5905; @param A3 Pointer to the second operand (input).
5906;
5907BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
5908 PROLOGUE_4_ARGS
5909 IEMIMPL_SSE_PROLOGUE
5910 SSE_LD_FXSTATE_MXCSR A0
5911
5912 cvttss2si T0, [A3]
5913 mov qword [A2], T0
5914
5915 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5916 IEMIMPL_SSE_EPILOGUE
5917 EPILOGUE_4_ARGS
5918ENDPROC iemAImpl_cvttss2si_i64_r32
5919
5920
5921;;
5922; cvtss2si instruction - 32-bit variant.
5923;
5924; @param A0 FPU context (FXSTATE or XSAVEAREA).
5925; @param A1 Where to return the MXCSR value.
5926; @param A2 Pointer to the result operand (output).
5927; @param A3 Pointer to the second operand (input).
5928;
5929BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
5930 PROLOGUE_4_ARGS
5931 IEMIMPL_SSE_PROLOGUE
5932 SSE_LD_FXSTATE_MXCSR A0
5933
5934 cvtss2si T0_32, [A3]
5935 mov dword [A2], T0_32
5936
5937 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5938 IEMIMPL_SSE_EPILOGUE
5939 EPILOGUE_4_ARGS
5940ENDPROC iemAImpl_cvtss2si_i32_r32
5941
5942;;
5943; cvtss2si instruction - 64-bit variant.
5944;
5945; @param A0 FPU context (FXSTATE or XSAVEAREA).
5946; @param A1 Where to return the MXCSR value.
5947; @param A2 Pointer to the result operand (output).
5948; @param A3 Pointer to the second operand (input).
5949;
5950BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
5951 PROLOGUE_4_ARGS
5952 IEMIMPL_SSE_PROLOGUE
5953 SSE_LD_FXSTATE_MXCSR A0
5954
5955 cvtss2si T0, [A3]
5956 mov qword [A2], T0
5957
5958 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5959 IEMIMPL_SSE_EPILOGUE
5960 EPILOGUE_4_ARGS
5961ENDPROC iemAImpl_cvtss2si_i64_r32
5962
5963
5964;;
5965; cvtsi2ss instruction - 32-bit variant.
5966;
5967; @param A0 FPU context (FXSTATE or XSAVEAREA).
5968; @param A1 Where to return the MXCSR value.
5969; @param A2 Pointer to the result operand (output).
5970; @param A3 Pointer to the second operand (input).
5971;
5972BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
5973 PROLOGUE_4_ARGS
5974 IEMIMPL_SSE_PROLOGUE
5975 SSE_LD_FXSTATE_MXCSR A0
5976
5977 cvtsi2ss xmm0, dword [A3]
5978 movd dword [A2], xmm0
5979
5980 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5981 IEMIMPL_SSE_EPILOGUE
5982 EPILOGUE_4_ARGS
5983ENDPROC iemAImpl_cvtsi2ss_r32_i32
5984
5985;;
5986; cvtsi2ss instruction - 64-bit variant.
5987;
5988; @param A0 FPU context (FXSTATE or XSAVEAREA).
5989; @param A1 Where to return the MXCSR value.
5990; @param A2 Pointer to the result operand (output).
5991; @param A3 Pointer to the second operand (input).
5992;
5993BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
5994 PROLOGUE_4_ARGS
5995 IEMIMPL_SSE_PROLOGUE
5996 SSE_LD_FXSTATE_MXCSR A0
5997
5998 cvtsi2ss xmm0, qword [A3]
5999 movd dword [A2], xmm0
6000
6001 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6002 IEMIMPL_SSE_EPILOGUE
6003 EPILOGUE_4_ARGS
6004ENDPROC iemAImpl_cvtsi2ss_r32_i64
6005
6006
6007;;
6008; cvtsi2sd instruction - 32-bit variant.
6009;
6010; @param A0 FPU context (FXSTATE or XSAVEAREA).
6011; @param A1 Where to return the MXCSR value.
6012; @param A2 Pointer to the result operand (output).
6013; @param A3 Pointer to the second operand (input).
6014;
6015BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6016 PROLOGUE_4_ARGS
6017 IEMIMPL_SSE_PROLOGUE
6018 SSE_LD_FXSTATE_MXCSR A0
6019
6020 cvtsi2sd xmm0, dword [A3]
6021 movq [A2], xmm0
6022
6023 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6024 IEMIMPL_SSE_EPILOGUE
6025 EPILOGUE_4_ARGS
6026ENDPROC iemAImpl_cvtsi2sd_r64_i32
6027
6028;;
6029; cvtsi2sd instruction - 64-bit variant.
6030;
6031; @param A0 FPU context (FXSTATE or XSAVEAREA).
6032; @param A1 Where to return the MXCSR value.
6033; @param A2 Pointer to the result operand (output).
6034; @param A3 Pointer to the second operand (input).
6035;
6036BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6037 PROLOGUE_4_ARGS
6038 IEMIMPL_SSE_PROLOGUE
6039 SSE_LD_FXSTATE_MXCSR A0
6040
6041 cvtsi2sd xmm0, qword [A3]
6042 movq [A2], xmm0
6043
6044 SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6045 IEMIMPL_SSE_EPILOGUE
6046 EPILOGUE_4_ARGS
6047ENDPROC iemAImpl_cvtsi2sd_r64_i64
6048
6049
6050;;
6051; Initialize the SSE MXCSR register partially from the guest value so
6052; that the rounding mode and the FZ/DAZ bits are taken into account.
6053;
6054; @uses 4 bytes of stack to save the original value, T0.
6055; @param 1 Expression giving the address of the MXCSR register of the guest.
6056;
6057%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6058 sub xSP, 4
6059
6060 stmxcsr [xSP]
6061 mov T0_32, [%1]
6062 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
6063 or T0_32, X86_MXCSR_XCPT_MASK
6064 sub xSP, 4
6065 mov [xSP], T0_32
6066 ldmxcsr [xSP]
6067 add xSP, 4
6068%endmacro
6069
6070
6071;;
6072; Restores the SSE MXCSR register with the original value.
6073;
6074; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
6075; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6076;
6077; @note Restores the stack pointer.
6078;
6079%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6080 sub xSP, 4
6081 stmxcsr [xSP]
6082 mov T0_32, [xSP]
6083 add xSP, 4
6084 ; Merge the status bits into the original MXCSR value.
6085 mov T1_32, [%1]
6086 and T0_32, X86_MXCSR_XCPT_FLAGS
6087 or T0_32, T1_32
6088 mov [%1], T0_32
6089
6090 ldmxcsr [xSP]
6091 add xSP, 4
6092%endmacro
6093
6094
6095;
6096; UCOMISS (SSE)
6097;
6098; @param A0 Pointer to the MXCSR value (input/output).
6099; @param A1 Pointer to the EFLAGS value (input/output).
6100; @param A2 Pointer to the first source operand (aka readonly destination).
6101; @param A3 Pointer to the second source operand.
6102;
6103BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6104 PROLOGUE_4_ARGS
6105 IEMIMPL_SSE_PROLOGUE
6106 SSE_LD_FXSTATE_MXCSR_ONLY A0
6107
6108 movdqu xmm0, [A2]
6109 movdqu xmm1, [A3]
6110 ucomiss xmm0, xmm1
6111 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6112
6113 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6114 IEMIMPL_SSE_EPILOGUE
6115 EPILOGUE_4_ARGS
6116ENDPROC iemAImpl_ucomiss_u128
6117
6118BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6119 PROLOGUE_4_ARGS
6120 IEMIMPL_SSE_PROLOGUE
6121 SSE_LD_FXSTATE_MXCSR_ONLY A0
6122
6123 movdqu xmm0, [A2]
6124 movdqu xmm1, [A3]
6125 vucomiss xmm0, xmm1
6126 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6127
6128 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6129 IEMIMPL_SSE_EPILOGUE
6130 EPILOGUE_4_ARGS
6131ENDPROC iemAImpl_vucomiss_u128
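
; For reference (standard UCOMISS/UCOMISD semantics, not specific to these
; wrappers): the comparison outcome is reported in ZF/PF/CF while OF/AF/SF are
; cleared -
;       unordered (NaN operand):  ZF=1, PF=1, CF=1
;       src1 >  src2:             ZF=0, PF=0, CF=0
;       src1 <  src2:             ZF=0, PF=0, CF=1
;       src1 == src2:             ZF=1, PF=0, CF=0
; which is why IEM_SAVE_FLAGS captures the full X86_EFL_STATUS_BITS set above.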
6132
6133
6134;
6135; UCOMISD (SSE)
6136;
6137; @param A0 Pointer to the MXCSR value (input/output).
6138; @param A1 Pointer to the EFLAGS value (input/output).
6139; @param A2 Pointer to the first source operand (aka readonly destination).
6140; @param A3 Pointer to the second source operand.
6141;
6142BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6143 PROLOGUE_4_ARGS
6144 IEMIMPL_SSE_PROLOGUE
6145 SSE_LD_FXSTATE_MXCSR_ONLY A0
6146
6147 movdqu xmm0, [A2]
6148 movdqu xmm1, [A3]
6149 ucomisd xmm0, xmm1
6150 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6151
6152 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6153 IEMIMPL_SSE_EPILOGUE
6154 EPILOGUE_4_ARGS
6155ENDPROC iemAImpl_ucomisd_u128
6156
6157BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6158 PROLOGUE_4_ARGS
6159 IEMIMPL_SSE_PROLOGUE
6160 SSE_LD_FXSTATE_MXCSR_ONLY A0
6161
6162 movdqu xmm0, [A2]
6163 movdqu xmm1, [A3]
6164 vucomisd xmm0, xmm1
6165 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6166
6167 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6168 IEMIMPL_SSE_EPILOGUE
6169 EPILOGUE_4_ARGS
6170ENDPROC iemAImpl_vucomisd_u128
6171
6172;
6173; COMISS (SSE)
6174;
6175; @param A0 Pointer to the MXCSR value (input/output).
6176; @param A1 Pointer to the EFLAGS value (input/output).
6177; @param A2 Pointer to the first source operand (aka readonly destination).
6178; @param A3 Pointer to the second source operand.
6179;
6180BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6181 PROLOGUE_4_ARGS
6182 IEMIMPL_SSE_PROLOGUE
6183 SSE_LD_FXSTATE_MXCSR_ONLY A0
6184
6185 movdqu xmm0, [A2]
6186 movdqu xmm1, [A3]
6187 comiss xmm0, xmm1
6188 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6189
6190 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6191 IEMIMPL_SSE_EPILOGUE
6192 EPILOGUE_4_ARGS
6193ENDPROC iemAImpl_comiss_u128
6194
6195BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6196 PROLOGUE_4_ARGS
6197 IEMIMPL_SSE_PROLOGUE
6198 SSE_LD_FXSTATE_MXCSR_ONLY A0
6199
6200 movdqu xmm0, [A2]
6201 movdqu xmm1, [A3]
6202 vcomiss xmm0, xmm1
6203 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6204
6205 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6206 IEMIMPL_SSE_EPILOGUE
6207 EPILOGUE_4_ARGS
6208ENDPROC iemAImpl_vcomiss_u128
6209
6210
6211;
6212; COMISD (SSE)
6213;
6214; @param A0 Pointer to the MXCSR value (input/output).
6215; @param A1 Pointer to the EFLAGS value (input/output).
6216; @param A2 Pointer to the first source operand (aka readonly destination).
6217; @param A3 Pointer to the second source operand.
6218;
6219BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6220 PROLOGUE_4_ARGS
6221 IEMIMPL_SSE_PROLOGUE
6222 SSE_LD_FXSTATE_MXCSR_ONLY A0
6223
6224 movdqu xmm0, [A2]
6225 movdqu xmm1, [A3]
6226 comisd xmm0, xmm1
6227 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6228
6229 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6230 IEMIMPL_SSE_EPILOGUE
6231 EPILOGUE_4_ARGS
6232ENDPROC iemAImpl_comisd_u128
6233
6234BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6235 PROLOGUE_4_ARGS
6236 IEMIMPL_SSE_PROLOGUE
6237 SSE_LD_FXSTATE_MXCSR_ONLY A0
6238
6239 movdqu xmm0, [A2]
6240 movdqu xmm1, [A3]
6241 vcomisd xmm0, xmm1
6242 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6243
6244 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6245 IEMIMPL_SSE_EPILOGUE
6246 EPILOGUE_4_ARGS
6247ENDPROC iemAImpl_vcomisd_u128
6248
6249
6250;;
6251; @todo Move this somewhere better as well?
6252;
6253struc IEMMEDIAF2XMMSRC
6254 .uSrc1 resd 4
6255 .uSrc2 resd 4
6256endstruc
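
; Purely illustrative sketch (not assembled; values are made up): one way to lay
; out an IEMMEDIAF2XMMSRC instance, here statically with 1.0f in lane 0 of the
; first source and 2.0f in lane 0 of the second.  The CMPPS/CMPPD/CMPSS/CMPSD
; and ROUNDxx/DPPx wrappers below receive a pointer to such a block in A2.
%if 0
        istruc IEMMEDIAF2XMMSRC
            at IEMMEDIAF2XMMSRC.uSrc1, dd 0x3f800000, 0, 0, 0 ; first 128-bit source
            at IEMMEDIAF2XMMSRC.uSrc2, dd 0x40000000, 0, 0, 0 ; second 128-bit source
        iend
%endif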
6257
6258
6259;
6260; CMPPS (SSE)
6261;
6262; @param A0 Pointer to the MXCSR value (input/output).
6263; @param A1 Pointer to the first media register size operand (output).
6264; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6265; @param A3 The 8-bit immediate (input).
6266;
6267BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6268 PROLOGUE_4_ARGS
6269 IEMIMPL_SSE_PROLOGUE
6270 SSE_LD_FXSTATE_MXCSR_ONLY A0
6271
6272 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6273 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6274 lea T1, [.imm0 xWrtRIP]
6275 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6276 lea T0, [A3 + A3*8] ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6277 %else
6278 lea T0, [A3 + A3*4] ; sizeof(cmpps+ret) == 5: A3 * 5
6279 %endif
6280 lea T1, [T1 + T0]
6281 IBT_NOTRACK
6282 call T1
6283 movdqu [A1], xmm0
6284
6285 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6286 IEMIMPL_SSE_EPILOGUE
6287 EPILOGUE_4_ARGS
6288 %assign bImm 0
6289 %rep 256
6290.imm %+ bImm:
6291 IBT_ENDBRxx_WITHOUT_NOTRACK
6292 cmpps xmm0, xmm1, bImm
6293 ret
6294 %assign bImm bImm + 1
6295 %endrep
6296.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6297ENDPROC iemAImpl_cmpps_u128
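
; Illustration of the dispatch above (not an exhaustive description): without
; IBT each stub is 5 bytes (cmpps reg,reg,imm8 = 4 bytes + ret = 1), so for an
; immediate of e.g. 2 the computed offset is 2 * 5 = 10 and the call lands on
; .imm2, executing 'cmpps xmm0, xmm1, 2'; with IBT the endbrxx prefix grows
; each stub to 9 bytes, hence the A3 * 9 scaling.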
6298
6299;;
6300; SSE instructions with 8-bit immediates of the form
6301; xxx xmm1, xmm2, imm8.
6302; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6303; register.
6304;
6305; @param 1 The instruction name.
6306;
6307; @param A0 Pointer to the MXCSR value (input/output).
6308; @param A1 Pointer to the first media register size operand (output).
6309; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6310; @param A3 The 8-bit immediate (input).
6311;
6312%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6313BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6314 PROLOGUE_4_ARGS
6315 IEMIMPL_SSE_PROLOGUE
6316 SSE_LD_FXSTATE_MXCSR_ONLY A0
6317
6318 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6319 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6320 lea T1, [.imm0 xWrtRIP]
6321 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6322 lea T0, [A3 + A3*4] ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6323 %else
6324 lea T0, [A3 + A3*2] ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6325 %endif
6326 lea T1, [T1 + T0*2]
6327 IBT_NOTRACK
6328 call T1
6329 movdqu [A1], xmm0
6330
6331 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6332 IEMIMPL_SSE_EPILOGUE
6333 EPILOGUE_4_ARGS
6334 %assign bImm 0
6335 %rep 256
6336.imm %+ bImm:
6337 IBT_ENDBRxx_WITHOUT_NOTRACK
6338 %1 xmm0, xmm1, bImm
6339 ret
6340 %assign bImm bImm + 1
6341 %endrep
6342.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6343ENDPROC iemAImpl_ %+ %1 %+ _u128
6344%endmacro
6345
6346IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6347IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6348IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6349
6350;;
6351; SSE instructions with 8-bit immediates of the form
6352; xxx xmm1, xmm2, imm8.
6353; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6354; register.
6355;
6356; @param 1 The instruction name.
6357;
6358; @param A0 Pointer to the MXCSR value (input/output).
6359; @param A1 Pointer to the first media register size operand (output).
6360; @param A2 Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6361; @param A3 The 8-bit immediate (input).
6362;
6363%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6364BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6365 PROLOGUE_4_ARGS
6366 IEMIMPL_SSE_PROLOGUE
6367 SSE_LD_FXSTATE_MXCSR_ONLY A0
6368
6369 movdqu xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6370 movdqu xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6371 lea T1, [.imm0 xWrtRIP]
6372 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6373 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6374 lea T1, [T1 + T0*4]
6375 %else
6376 lea T1, [T1 + A3*8] ; sizeof(insn+ret+int3) == 8: A3 * 8
6377 %endif
6378 IBT_NOTRACK
6379 call T1
6380 movdqu [A1], xmm0
6381
6382 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6383 IEMIMPL_SSE_EPILOGUE
6384 EPILOGUE_4_ARGS
6385 %assign bImm 0
6386 %rep 256
6387.imm %+ bImm:
6388 IBT_ENDBRxx_WITHOUT_NOTRACK
6389 %1 xmm0, xmm1, bImm
6390 ret
6391 int3
6392 %assign bImm bImm + 1
6393 %endrep
6394.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6395ENDPROC iemAImpl_ %+ %1 %+ _u128
6396%endmacro
6397
6398IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6399IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6400IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6401IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6402IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6403IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6404
6405
6406;;
6407; SSE instructions of the form
6408; xxx mm, xmm.
6409; where we need to load and save the MXCSR register.
6410;
6411; @param 1 The instruction name.
6412;
6413; @param A0 Pointer to the MXCSR value (input/output).
6414; @param A1 Pointer to the first MMX register sized operand (output).
6415; @param A2 Pointer to the media register sized operand (input).
6416;
6417%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6418BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6419 PROLOGUE_3_ARGS
6420 IEMIMPL_SSE_PROLOGUE
6421 SSE_LD_FXSTATE_MXCSR_ONLY A0
6422
6423 movdqu xmm0, [A2]
6424 %1 mm0, xmm0
6425 movq [A1], mm0
6426
6427 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6428 IEMIMPL_SSE_EPILOGUE
6429 EPILOGUE_3_ARGS
6430ENDPROC iemAImpl_ %+ %1 %+ _u128
6431%endmacro
6432
6433IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6434IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6435
6436;;
6437; SSE instructions of the form
6438; xxx xmm, mm/m64.
6439; where we need to load and save the MXCSR register.
6440;
6441; @param 1 The instruction name.
6442;
6443; @param A0 Pointer to the MXCSR value (input/output).
6444; @param A1 Pointer to the first media register sized operand (input/output).
6445; @param A2 The 64-bit source value from an MMX media register (input).
6446;
6447%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6448BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6449 PROLOGUE_3_ARGS
6450 IEMIMPL_SSE_PROLOGUE
6451 SSE_LD_FXSTATE_MXCSR_ONLY A0
6452
6453 movdqu xmm0, [A1]
6454 movq mm0, A2
6455 %1 xmm0, mm0
6456 movdqu [A1], xmm0
6457
6458 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6459 IEMIMPL_SSE_EPILOGUE
6460 EPILOGUE_3_ARGS
6461ENDPROC iemAImpl_ %+ %1 %+ _u128
6462%endmacro
6463
6464IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6465IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6466
6467;;
6468; SSE instructions of the form
6469; xxx mm, xmm/m64.
6470; where we need to load and save the MXCSR register.
6471;
6472; @param 1 The instruction name.
6473;
6474; @param A0 Pointer to the MXCSR value (input/output).
6475; @param A1 Pointer to the first MMX media register sized operand (output).
6476; @param A2 The 64-bit source value (input).
6477;
6478%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6479BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6480 PROLOGUE_3_ARGS
6481 IEMIMPL_SSE_PROLOGUE
6482 SSE_LD_FXSTATE_MXCSR_ONLY A0
6483
6484 movq xmm0, A2
6485 %1 mm0, xmm0
6486 movq [A1], mm0
6487
6488 SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6489 IEMIMPL_SSE_EPILOGUE
6490 EPILOGUE_3_ARGS
6491ENDPROC iemAImpl_ %+ %1 %+ _u128
6492%endmacro
6493
6494IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6495IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6496
6497;
6498; All forms of RDRAND and RDSEED (%1 = mnemonic, %2 = register, %3 = operand size in bits).
6499;
6500; @param A0 Pointer to the destination operand.
6501; @param A1 Pointer to the EFLAGS value (input/output).
6502;
6503%macro IEMIMPL_RDRAND_RDSEED 3
6504BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6505 PROLOGUE_2_ARGS
6506
6507 %1 %2
6508 mov [A0], %2
6509 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6510
6511 EPILOGUE_2_ARGS
6512ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6513%endmacro
6514
6515IEMIMPL_RDRAND_RDSEED rdrand, ax, 16
6516IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6517IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6518IEMIMPL_RDRAND_RDSEED rdseed, ax, 16
6519IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6520IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
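
; Purely illustrative sketch (not assembled; register choice is arbitrary):
; RDRAND/RDSEED report success in CF - CF=1 means a value was delivered, CF=0
; means no entropy was available yet - which is why the wrappers above hand the
; resulting EFLAGS back through A1.  Guest code typically retries (bounded in
; real code) along these lines:
%if 0
.rdrand_retry:
        rdrand  rax                 ; request 64 random bits
        jnc     .rdrand_retry       ; CF clear -> nothing delivered, try again
%endif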
6521
6522
6523;;
6524; sha1rnds4 xmm1, xmm2, imm8.
6525;
6528; @param A0 Pointer to the first media register size operand (input/output).
6529; @param A1 Pointer to the second source media register size operand (input).
6530; @param A2 The 8-bit immediate
6531;
6532BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6533 PROLOGUE_3_ARGS
6534 IEMIMPL_SSE_PROLOGUE
6535
6536 movdqu xmm0, [A0]
6537 movdqu xmm1, [A1]
6538 lea T1, [.imm0 xWrtRIP]
6539 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6540 lea T0, [A2 + A2*4] ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6541 %else
6542 lea T0, [A2 + A2*2] ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6543 %endif
6544 lea T1, [T1 + T0*2]
6545 IBT_NOTRACK
6546 call T1
6547 movdqu [A0], xmm0
6548
6549 IEMIMPL_SSE_EPILOGUE
6550 EPILOGUE_3_ARGS
6551 %assign bImm 0
6552 %rep 256
6553.imm %+ bImm:
6554 IBT_ENDBRxx_WITHOUT_NOTRACK
6555 sha1rnds4 xmm0, xmm1, bImm
6556 ret
6557 %assign bImm bImm + 1
6558 %endrep
6559.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6560ENDPROC iemAImpl_sha1rnds4_u128
6561
6562
6563;;
6564; sha256rnds2 xmm1, xmm2, <XMM0>.
6565;
6568; @param A0 Pointer to the first media register size operand (input/output).
6569; @param A1 Pointer to the second source media register size operand (input).
6570; @param A2 Pointer to the implicit XMM0 constants (input).
6571;
6572BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6573 PROLOGUE_3_ARGS
6574 IEMIMPL_SSE_PROLOGUE
6575
6576 movdqu xmm0, [A2]
6577 movdqu xmm1, [A0]
6578 movdqu xmm2, [A1]
6579 sha256rnds2 xmm1, xmm2
6580 movdqu [A0], xmm1
6581
6582 IEMIMPL_SSE_EPILOGUE
6583 EPILOGUE_3_ARGS
6584ENDPROC iemAImpl_sha256rnds2_u128
6585
6586
6587;
6588; 32-bit forms of ADCX and ADOX (%1 = instruction, %2 = EFLAGS bit to load and save).
6589;
6590; @param A0 Pointer to the destination operand (input/output).
6591; @param A1 Pointer to the EFLAGS value (input/output).
6592; @param A2 32-bit source operand 1 (input).
6593;
6594%macro IEMIMPL_ADX_32 2
6595BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6596 PROLOGUE_4_ARGS
6597
6598 IEM_LOAD_FLAGS A1, %2, 0
6599 %1 A2_32, [A0]
6600 mov [A0], A2_32
6601 IEM_SAVE_FLAGS A1, %2, 0
6602
6603 EPILOGUE_4_ARGS
6604ENDPROC iemAImpl_ %+ %1 %+ _u32
6605%endmacro
6606
6607;
6608; 64-bit forms of ADCX and ADOX (%1 = instruction, %2 = EFLAGS bit to load and save).
6609;
6610; @param A0 Pointer to the destination operand (input/output).
6611; @param A1 Pointer to the EFLAGS value (input/output).
6612; @param A2 64-bit source operand 1 (input).
6613;
6614%macro IEMIMPL_ADX_64 2
6615BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6616 PROLOGUE_4_ARGS
6617
6618 IEM_LOAD_FLAGS A1, %2, 0
6619 %1 A2, [A0]
6620 mov [A0], A2
6621 IEM_SAVE_FLAGS A1, %2, 0
6622
6623 EPILOGUE_4_ARGS
6624ENDPROC iemAImpl_ %+ %1 %+ _u64
6625%endmacro
6626
6627IEMIMPL_ADX_32 adcx, X86_EFL_CF
6628IEMIMPL_ADX_64 adcx, X86_EFL_CF
6629
6630IEMIMPL_ADX_32 adox, X86_EFL_OF
6631IEMIMPL_ADX_64 adox, X86_EFL_OF
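
; Purely illustrative sketch (not assembled; register use is hypothetical):
; ADCX only reads and writes CF while ADOX only reads and writes OF, so two
; independent carry chains can be interleaved without saving flags in between.
; The classic use is the inner loop of a multi-precision multiply together
; with MULX, roughly:
%if 0
        xor     eax, eax            ; clears both CF and OF in one go
        mulx    r9, r8, qword [rsi] ; r9:r8 = rdx * limb, flags untouched
        adcx    r8, qword [rdi]     ; carry chain 1 (CF): accumulate low half
        adox    r9, qword [rdi + 8] ; carry chain 2 (OF): accumulate high half
        mov     [rdi], r8
        mov     [rdi + 8], r9
%endif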