VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 100651

最後變更 在這個檔案從100651是 100607,由 vboxsync 提交於 20 月 前

VMM/IEM: Implement vperm2f128/vperm2i128 instruction emulations, bugref:9898

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 187.8 KB
 
1; $Id: IEMAllAImpl.asm 100607 2023-07-17 16:38:48Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its own stack arguments, so the
; RET must carry the argument byte count.  On all other targets (including
; AMD64, where arguments travel in registers) a plain RET suffices.
;
; @param 1      The number of argument bytes to pop (used on x86 Windows only).
;
%macro RET_FASTCALL 1
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
        ret     %1
 %else
        ret
 %endif
%else
        ret
%endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; On 32-bit Windows, fastcall symbols are decorated as @Name@cbArgs; the
; prefix argument supplies the leading character (the '$@' dance works
; around yasm quirks).  Everywhere else the plain NAME() mangling is used
; and the extra arguments are ignored.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here).  Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Exports the (possibly fastcall-decorated) symbol where the object format
; needs it (PE, OMF), declares it global, defines the label and emits the
; IBT end-branch marker so the function may be reached indirectly when
; CET/IBT is enabled.
;
; @param 1      The function name (C).
; @param 2      The argument size on x86.
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
        IBT_ENDBRxx
%endmacro
92
93
;
; We employ some macro assembly here to hide the calling convention differences.
;
%ifdef RT_ARCH_AMD64
 ;
 ; AMD64: Up to four arguments are passed in registers in both the SysV (GCC)
 ; and Microsoft conventions, so the prologues are empty and the epilogues
 ; are plain RETs.  The _EX variants take the x86 stack-argument byte count,
 ; which is irrelevant here and therefore ignored.
 ;
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; SysV AMD64: integer arguments arrive in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64: integer arguments arrive in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Scratch (caller-saved) registers, volatile in both 64-bit conventions.
 %define T0        rax
 %define T0_32     eax
 %define T0_16     ax
 %define T0_8      al

 %define T1        r11
 %define T1_32     r11d
 %define T1_16     r11w
 %define T1_8      r11b

 %define T2        r10                  ; only AMD64
 %define T2_32     r10d
 %define T2_16     r10w
 %define T2_8      r10b

%else
 ; x86: fastcall style - A0 in ecx, A1 in edx, further arguments on the stack.
 ; Callee-saved registers used for A2/A3 and T1 are preserved in the prologues.
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; 3rd argument: above return address + saved ebx.
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; 3rd argument: above return address + three saved regs.
        mov     esi, [esp + 12 + 4 + 4] ; 4th argument.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be removed from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0        ecx
 %define A0_32     ecx
 %define A0_16     cx
 %define A0_8      cl

 %define A1        edx
 %define A1_32     edx
 %define A1_16     dx
 %define A1_8      dl

 %define A2        ebx
 %define A2_32     ebx
 %define A2_16     bx
 %define A2_8      bl

 %define A3        esi
 %define A3_32     esi
 %define A3_16     si

 %define T0        eax
 %define T0_32     eax
 %define T0_16     ax
 %define T0_8      al

 %define T1        edi
 %define T1_32     edi
 %define T1_16     di
%endif
282
283
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest's modified+undefined EFLAGS subset into the host flags so
; the emulated instruction starts out from the guest flag state.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) pointing to the eflags.
; @param 2      The set of modified flags.
; @param 3      The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
303
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: always merges the guest's
; modified+undefined EFLAGS subset into the host flags.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param 1      The parameter (A0..A3) pointing to the eflags.
; @param 2      The set of flags to load.
; @param 3      The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
321
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the just-executed instruction and
; stores the modified+undefined subset back into the guest eflags at [%1],
; preserving all other guest flag bits.
;
; @remarks Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the emulated op.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
341
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 bits clear and the %4
; bits set in the guest eflags, regardless of the host result.
;
; @remarks Clobbers T0, T1, stack.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear
; @param 4      Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host EFLAGS after the emulated op.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
365
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS; also T2 on AMD64.
; @param 1      The register pointing to the EFLAGS.
; @param 2      The mask of modified flags to save.
; @param 3      Mask of additional flags to always clear
; @param 4      The result register to set SF by.
; @param 5      The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6      The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf                           ; grab the host flags into T2...
        pop     T2
 %else
        push    T0                      ; x86 has no T2; preserve T0/xAX around the flag grab.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore the saved result register.
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; reduce to a byte index into the parity table.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
419
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Pure guest-flag adjustment: does not sample the host EFLAGS at all.
;
; @remarks Clobbers T0.
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear
; @param 3      Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
440
;;
; Calculates the new EFLAGS using fixed clear and set bit masks, then ORs in
; PF looked up from the parity table using %4 as the index.
;
; @remarks Clobbers T0, %4, EFLAGS; also T2 on AMD64 (holds the parity table address).
; @param 1      The register pointing to the EFLAGS.
; @param 2      Mask of additional flags to always clear
; @param 3      Mask of additional flags to always set.
; @param 4      The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Reduce to a byte index into the 256-entry parity table.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
465
466
;;
; Checks that the size expression %1 matches %2 adjusted according to
; RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK and for 256 entries.
;
; Build-time assertion: each DW overflows/underflows a 16-bit word and thus
; triggers an assembler warning when the two sizes disagree.
;
; @param 1      The jump array size assembly expression.
; @param 2      The size without accounting for the IBT_ENDBRxx_WITHOUT_NOTRACK instruction.
;
%macro IEMCHECK_256_JUMP_ARRAY_SIZE 2
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        dw      (0xffff - %2 - 256*4) + %1 ; will cause warning if entries are too big.
        dw      (0xffff + %2 + 256*4) - %1 ; will cause warning if entries are too small.
 %else
        dw      (0xffff - %2) + %1      ; will cause warning if entries are too big.
        dw      (0xffff + %2) - %1      ; will cause warning if entries are too small.
 %endif
%endmacro
482
483
484;*********************************************************************************************************************************
485;* External Symbols *
486;*********************************************************************************************************************************
487extern NAME(g_afParity)
488
489
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      byte [A0], A1_8         ; perform the operation directly on guest memory.
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; 64-bit source occupies 8 bytes of x86 stack args.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and, 1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
589
590
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; 3-operand VEX form: result goes to T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
631
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; NOTE(review): uses PROLOGUE_4_ARGS/EPILOGUE_4_ARGS although only three
; parameters are documented - presumably just conservative register saving;
; confirm against upstream before changing.
;
; @param 1      The instruction mnemonic.
; @param 2      The modified flags.
; @param 3      The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0_32, [A0]             ; destination is also the first source.
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
673
674
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1      The instruction mnemonic.
; @param 2      Fallback instruction if applicable.
; @param 3      Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; legacy shift instruction needs the count in CL.
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; A0 is (e)cx; after swap CL = count, A2 = dst ptr.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; store the full 64-bit result.
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; store the full 64-bit result.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
746
747
;
; RORX uses an immediate byte for the shift count, so we only do a
; fallback implementation of that one.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; ROR needs the rotate count in CL.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; A0 is (e)cx; after swap CL = count, A2 = dst ptr.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1
 %else
        xchg    A2, A0
        ror     A1, cl
        mov     [A2], A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
782
;
; MULX
;
; Unsigned multiply of EDX/RDX by the third operand; low half to [A1]
; (or [A2] after the MSC swap), high half to [A0].
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


; Fallback for hosts without BMI2: plain MUL (clobbers host flags, which MULX would not).
BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX
        mov     eax, A3_32
        mul     A2_32
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1
        xchg    A1, A2
        mov     eax, A3_32
        mul     A2_32
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX
        mov     rax, A3
        mul     A2
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1
        xchg    A1, A2
        mov     rax, A3
        mul     A2
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
863
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1      The instruction mnemonic.
; @param 2      Non-zero if there should be a locked version.
; @param 3      The modified flags.
; @param 4      The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8            ; 64-bit source occupies 8 bytes of x86 stack args.
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro
IEMIMPL_BIT_OP bt, 0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
940
941;;
942; Macro for implementing a bit search operator.
943;
944; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
945; system where the 64-bit accesses requires hand coding.
946;
947; All the functions takes a pointer to the destination memory operand in A0,
948; the source register operand in A1 and a pointer to eflags in A2.
949;
950; In the ZF case the destination register is 'undefined', however it seems that
951; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
952; AMD and Intel and accoridng to https://www.sandpile.org/x86/flags.htm between
953; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
954; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
955;
956; @param 1 The instruction mnemonic.
957; @param 2 The modified flags.
958; @param 3 The undefined flags.
959; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
960;
961%macro IEMIMPL_BIT_OP2 4
962BEGINCODE
963BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
964 PROLOGUE_3_ARGS
965 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
966 %1 T0_16, A1_16
967%if %4 != 0
968 jz .unchanged_dst
969%endif
970 mov [A0], T0_16
971.unchanged_dst:
972 IEM_SAVE_FLAGS A2, %2, %3
973 EPILOGUE_3_ARGS
974ENDPROC iemAImpl_ %+ %1 %+ _u16
975
976BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
977 PROLOGUE_3_ARGS
978 %1 T1_16, A1_16
979%if %4 != 0
980 jz .unchanged_dst
981%endif
982 mov [A0], T1_16
983 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
984 EPILOGUE_3_ARGS
985.unchanged_dst:
986 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
987 EPILOGUE_3_ARGS
988ENDPROC iemAImpl_ %+ %1 %+ _u16_intel
989
990BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
991 PROLOGUE_3_ARGS
992 %1 T0_16, A1_16
993%if %4 != 0
994 jz .unchanged_dst
995%endif
996 mov [A0], T0_16
997.unchanged_dst:
998 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
999 EPILOGUE_3_ARGS
1000ENDPROC iemAImpl_ %+ %1 %+ _u16_amd
1001
1002
1003BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
1004 PROLOGUE_3_ARGS
1005 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1006 %1 T0_32, A1_32
1007%if %4 != 0
1008 jz .unchanged_dst
1009%endif
1010 mov [A0], T0_32
1011.unchanged_dst:
1012 IEM_SAVE_FLAGS A2, %2, %3
1013 EPILOGUE_3_ARGS
1014ENDPROC iemAImpl_ %+ %1 %+ _u32
1015
1016BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
1017 PROLOGUE_3_ARGS
1018 %1 T1_32, A1_32
1019%if %4 != 0
1020 jz .unchanged_dst
1021%endif
1022 mov [A0], T1_32
1023 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1024 EPILOGUE_3_ARGS
1025.unchanged_dst:
1026 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1027 EPILOGUE_3_ARGS
1028ENDPROC iemAImpl_ %+ %1 %+ _u32_intel
1029
1030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
1031 PROLOGUE_3_ARGS
1032 %1 T0_32, A1_32
1033%if %4 != 0
1034 jz .unchanged_dst
1035%endif
1036 mov [A0], T0_32
1037.unchanged_dst:
1038 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1039 EPILOGUE_3_ARGS
1040ENDPROC iemAImpl_ %+ %1 %+ _u32_amd
1041
1042
1043 %ifdef RT_ARCH_AMD64
1044
1045BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
1046 PROLOGUE_3_ARGS
1047 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1048 %1 T0, A1
1049%if %4 != 0
1050 jz .unchanged_dst
1051%endif
1052 mov [A0], T0
1053.unchanged_dst:
1054 IEM_SAVE_FLAGS A2, %2, %3
1055 EPILOGUE_3_ARGS_EX 8
1056ENDPROC iemAImpl_ %+ %1 %+ _u64
1057
1058BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
1059 PROLOGUE_3_ARGS
1060 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
1061 %1 T1, A1
1062%if %4 != 0
1063 jz .unchanged_dst
1064%endif
1065 mov [A0], T1
1066 IEM_ADJUST_FLAGS_WITH_PARITY A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
1067 EPILOGUE_3_ARGS
1068.unchanged_dst:
1069 IEM_ADJUST_FLAGS A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
1070 EPILOGUE_3_ARGS
1071ENDPROC iemAImpl_ %+ %1 %+ _u64_intel
1072
1073BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
1074 PROLOGUE_3_ARGS
1075 %1 T0, A1
1076%if %4 != 0
1077 jz .unchanged_dst
1078%endif
1079 mov [A0], T0
1080.unchanged_dst:
1081 IEM_SAVE_AND_ADJUST_FLAGS A2, %2, 0, 0 ; Only the ZF flag is modified on AMD Zen 2.
1082 EPILOGUE_3_ARGS_EX 8
1083ENDPROC iemAImpl_ %+ %1 %+ _u64_amd
1084
1085 %endif ; RT_ARCH_AMD64
1086%endmacro
1087
; Instantiate the bit-scan / bit-count workers.
; Arguments: mnemonic, modified EFLAGS, undefined EFLAGS, and whether the
; destination is left unchanged when the source is zero (ZF set): 1 for
; bsf/bsr (store is skipped on jz), 0 for tzcnt/lzcnt which always store.
IEMIMPL_BIT_OP2 bsf, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr, (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1092
1093
;;
; Macro for implementing POPCNT.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; ASSUMES Intel and AMD set EFLAGS the same way, so no _intel/_amd variants
; are generated.
;
; ASSUMES the instruction does not support memory destination.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_16, A1_16            ; T0 = result; source is always a register (see ASSUMES above).
        mov     [A0], T0_16
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1143
1144
;
; IMUL is also a similar but yet different case (no lock, no mem dst).
; The rDX:rAX variant of imul is handled together with mul further down.
;
; All functions take a pointer to the destination operand (which is also the
; first source) in A0, the second source operand by value in A1, and a
; pointer to eflags in A2.
;
BEGINCODE
; @param 1 EFLAGS that are modified.
; @param 2 Undefined EFLAGS.
; @param 3 Function suffix.
; @param 4 EFLAGS variation: 0 for native,
; 1 for intel (clear AF and ZF, calculate SF and PF from the result),
; 2 for AMD (plain flag save, invoked with undefined mask 0).
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= *pu16Dst
        mov     [A0], A1_16
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1200
1201
;
; XCHG for memory operands. This implies locking. No flag changes.
;
; Each function takes two arguments, first the pointer to the memory,
; then the pointer to the register. They all return void.
;
; Note! xchg with a memory operand carries an implicit LOCK prefix, so no
; explicit 'lock' is needed below.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        xchg    [A0], T0_8              ; atomically swap with *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old *pu8Mem
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1242
1243; Unlocked variants for fDisregardLock mode.
1244
;;
; Non-atomic byte swap between *pu8Mem (A0) and *pu8Reg (A1); no flags touched.
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_8, [A0]              ; T1 = old *pu8Mem
        mov     T0_8, [A1]              ; T0 = old *pu8Reg
        mov     [A1], T1_8              ; *pu8Reg = old *pu8Mem
        mov     [A0], T0_8              ; *pu8Mem = old *pu8Reg
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked
1253
;;
; Non-atomic word swap between *pu16Mem (A0) and *pu16Reg (A1); no flags touched.
BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_16, [A0]             ; T1 = old *pu16Mem
        mov     T0_16, [A1]             ; T0 = old *pu16Reg
        mov     [A1], T1_16             ; *pu16Reg = old *pu16Mem
        mov     [A0], T0_16             ; *pu16Mem = old *pu16Reg
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked
1262
;;
; Non-atomic dword swap between *pu32Mem (A0) and *pu32Reg (A1); no flags touched.
BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1_32, [A0]             ; T1 = old *pu32Mem
        mov     T0_32, [A1]             ; T0 = old *pu32Reg
        mov     [A1], T1_32             ; *pu32Reg = old *pu32Mem
        mov     [A0], T0_32             ; *pu32Mem = old *pu32Reg
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked
1271
%ifdef RT_ARCH_AMD64
;;
; Non-atomic qword swap between *pu64Mem (A0) and *pu64Reg (A1); no flags touched.
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T1, [A0]                ; T1 = old *pu64Mem
        mov     T0, [A1]                ; T0 = old *pu64Reg
        mov     [A1], T1                ; *pu64Reg = old *pu64Mem
        mov     [A0], T0                ; *pu64Mem = old *pu64Reg
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1282
1283
;
; XADD for memory operands.
;
; Each function takes three arguments, first the pointer to the
; memory/register, then the pointer to the register, and finally a pointer to
; eflags. They all return void.
;
; All arithmetic status flags (OF, SF, ZF, AF, PF, CF) are updated from the
; addition; the _locked variants only differ by the explicit lock prefix.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        xadd    [A0], T0_8              ; *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old *pu8Mem
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64

; Locked variants: identical save for the lock prefix on the xadd.

BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]
        lock xadd [A0], T0_8
        mov     [A1], T0_8
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1375
1376
;
; CMPXCHG8B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
; uint32_t *pEFlags));
;
; Only ZF is loaded/saved in *pEFlags here.
;
; Note! Identical to iemAImpl_cmpxchg16b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved; cmpxchg8b needs ebx for the replacement value.

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ecx:ebx = replacement value (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = expected value (*pu64EaxEdx)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (old value on mismatch, unchanged on match)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved; cmpxchg8b needs ebx for the replacement value.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ecx:ebx = replacement value
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = expected value (rsi = pu64EaxEdx)
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]            ; rdi = pu64Dst

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        ; 32-bit host: all of esi/edi/ebx/ebp are callee-saved and needed below.
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments.
%endif
ENDPROC iemAImpl_cmpxchg8b

BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1471
%ifdef RT_ARCH_AMD64

;
; CMPXCHG16B.
;
; These are tricky register wise, so the code is duplicated for each calling
; convention.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu128RaxRdx, PRTUINT128U pu128RbxRcx,
; uint32_t *pEFlags));
;
; Only ZF is loaded/saved in *pEFlags here.
;
; Note! Identical to iemAImpl_cmpxchg8b.
;
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved; cmpxchg16b needs rbx for the replacement value.

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement value (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = expected value (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (old value on mismatch, unchanged on match)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved; cmpxchg16b needs rbx for the replacement value.

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement value
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = expected value (rsi = pu128RaxRdx)
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]           ; rdi = pu128Dst

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked

%endif ; RT_ARCH_AMD64
1540
1541
;
; CMPXCHG.
;
; WARNING! This code makes ASSUMPTIONS about which registers T1 and T0 are mapped to!
;
; C-proto:
; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t *puEax, uintX_t uReg, uint32_t *pEFlags));
;
; @param 1 Instruction prefix to emit (blank or 'lock').
; @param 2 Function name suffix (blank or '_locked').
;
BEGINCODE
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = expected value (*puEax); cmpxchg compares with al.
        %1 cmpxchg [A0], A2_8
        mov     [A1], al                ; write back al (old value on mismatch, unchanged on match).
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1 cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1 cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1 cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ; Note: always locked on this path, regardless of %1.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ecx:ebx = replacement value (*pu64Reg)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = expected value
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        jz      .cmpxchg8b_not_equal    ; inverted sense: ZF set here means *not* taken below...
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store
        cmp     [esi], eax
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1639
;;
; Macro for implementing a unary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions take a pointer to the memory operand in A0 and a pointer
; to eflags in A1 (unary: there is no separate source register operand).
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0              ; not modifies no flags.
1728
1729
;
; BSWAP. No flag changes.
;
; Each function takes one argument, pointer to the value to bswap
; (input/output). They all return void.
;
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h                          ; hand-emit an operand-size prefix to get the 16-bit bswap form.
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each half and store them exchanged.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low dword becomes the new high dword
        mov     [A0], T1                ; swapped high dword becomes the new low dword
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1771
1772
;;
; Macro for implementing a shift operation.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the shift count in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; the shift count must be in cl.
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; A0 maps to xCX in this convention, so this puts the count in cl...
        %1      byte [A1], cl           ; ...and the operand pointer in A1.
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1859
1860
;;
; Macro for implementing a double precision shift operation.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The functions takes the destination operand (r/m) in A0, the source (reg) in
; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
;
; @note the _intel and _amd variants are implemented in C.
;
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; A3 maps to xCX here; puts the count (A2) in cl...
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; ...and restore A2/A3 for IEM_SAVE_FLAGS.
 %else
        xchg    A0, A2                  ; A0 maps to xCX here; count lands in cl, dst pointer in A2.
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1931
1932
;;
; Macro for implementing multiplication operations.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
; 32-bit system where the 64-bit accesses requires hand coding.
;
; The 8-bit function only operates on AX, so it takes no DX pointer. The other
; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
; pointer to eflags in A3.
;
; The functions all return 0 so the caller can be used for div/idiv as well as
; for the mul/imul implementation.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Name suffix.
; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
;
; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
;
%macro IEMIMPL_MUL_OP 5
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     al, [A0]
        %1      A1_8                    ; ax = al * A1_8
        mov     [A0], ax                ; the full 16-bit product goes back into *pu16AX.
 %if %5 != 1
        IEM_SAVE_FLAGS A2, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (success; see function docs above).
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; save A1 first - it maps to xDX, which the instruction clobbers.
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2050
2051
BEGINCODE
;;
; Worker function for negating the value held in the T1:T0 register pair
; (32-bit registers, T1 = high half), i.e. computes the two's complement
; of the combined 64-bit value as 0 - value.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; zero pair to subtract from ([xSP+xCB]:[xSP] = 0:0)
        push    0
        xchg    T0_32, [xSP]            ; T0 = 0, [xSP] = old T0
        xchg    T1_32, [xSP + xCB]      ; T1 = 0, [xSP+xCB] = old T1
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the temporaries.
        ret
ENDPROC iemAImpl_negate_T0_T1_u32
2066
%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the value held in the T1:T0 register pair
; (64-bit registers, T1 = high half), i.e. computes the two's complement
; of the combined 128-bit value as 0 - value.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0                       ; zero pair to subtract from ([xSP+xCB]:[xSP] = 0:0)
        push    0
        xchg    T0, [xSP]               ; T0 = 0, [xSP] = old T0
        xchg    T1, [xSP + xCB]         ; T1 = 0, [xSP+xCB] = old T1
        sub     T0, [xSP]               ; T0 = 0 - old T0
        sbb     T1, [xSP + xCB]         ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the temporaries.
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2082
2083
2084;;
; Macro for implementing division operations.
2086;
2087; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2088; 32-bit system where the 64-bit accesses requires hand coding.
2089;
2090; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2091; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2092; pointer to eflags in A3.
2093;
2094; The functions all return 0 on success and -1 if a divide error should be
2095; raised by the caller.
2096;
2097; @param 1 The instruction mnemonic.
2098; @param 2 The modified flags.
2099; @param 3 The undefined flags.
2100; @param 4 1 if signed, 0 if unsigned.
2101; @param 5 Function suffix.
2102; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2103; 2 for AMD (set AF, clear PF, ZF and SF).
2104;
2105; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2106;
%macro IEMIMPL_DIV_OP 6
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
        PROLOGUE_3_ARGS

        ; div by chainsaw check (divide by zero).
        test    A1_8, A1_8
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A0 + 1], A1_8          ; Unsigned: AH (high dividend half) must be below the divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A0]             ; T0 = dividend
        mov     T1, A1                  ; T1 = saved divisor (because of missing T1_8 in 32-bit)
        test    A1_8, A1_8
        js      .divisor_negative
        test    T0_16, T0_16
        jns     .both_positive
        neg     T0_16
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_8, 0x7f              ; Special case for covering (divisor - 1).
        cmp     T0_8, A1_8
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A1_8
        test    T0_16, T0_16
        jns     .one_of_each
        neg     T0_16
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_16, 7
        cmp     T0_8, A1_8
        jae     .div_overflow
.div_no_overflow:
        mov     A1, T1                  ; restore divisor
 %endif

        ; All checks done: perform the division and store the result.
        IEM_MAYBE_LOAD_FLAGS A2, %2, %3
        mov     ax, [A0]
        %1      A1_8                    ; AX / divisor -> AL=quotient, AH=remainder.
        mov     [A0], ax
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A2, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_3_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check (divide by zero).
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Unsigned: high dividend half must be below the divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]             ; Assemble the 32-bit dividend in T0:
        shl     T0_32, 16               ;   high word from [A1],
        mov     T0_16, [A0]             ;   low word from [A0].
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        ; All checks done: perform the division and store the result.
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; A2 lives in xDX with the GCC convention; move the divisor out of the way.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 lives in xDX with the MSC convention; move the pointer out of the way.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check (divide by zero).
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Unsigned: high dividend half must be below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1                ; T1 = 64-bit |dividend| shifted right by 31, i.e.
        shr     T0_32, 31               ; the bits that must be below the divisor.
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the saved divisor.
 %endif

        ; All checks done: perform the division and store the result.
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; A2 lives in xDX with the GCC convention; move the divisor out of the way.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 lives in xDX with the MSC convention; move the pointer out of the way.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; drop the divisor saved on the signed path.
 %endif
.div_zero:                              ; (reached before the push, so no pop here)
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
        PROLOGUE_4_ARGS

        test    A2, A2                  ; div by zero check.
        jz      .div_zero
 %if %4 == 0
        cmp     [A1], A2                ; Unsigned: high dividend half must be below the divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it.
        mov     T0, [A0]                ; T0 = dividend low
        mov     T1, [A1]                ; T1 = dividend high
        test    A2, A2
        js      .divisor_negative
        test    T1, T1
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u64)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        mov     T1, 0x7fffffffffffffff  ; (64-bit immediates are only valid with mov.)
        and     T0, T1                  ; Special case for covering (divisor - 1).
        cmp     T0, A2
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2
        test    T1, T1
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u64)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1, 1
        shr     T0, 63
        or      T1, T0
        cmp     T1, A2
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the saved divisor.
 %endif

        ; All checks done: perform the division and store the result.
        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; A2 lives in xDX with the GCC convention; move the divisor out of the way.
        mov     rax, [A0]
        mov     rdx, [A1]
        %1      T1
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 lives in xDX with the MSC convention; move the pointer out of the way.
        mov     rax, [A0]
        mov     rdx, [T1]
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; return 0 = success.

.return:
        EPILOGUE_4_ARGS_EX 12

.div_overflow:
 %if %4 != 0
        pop     A2                      ; drop the divisor saved on the signed path.
 %endif
.div_zero:                              ; (reached before the push, so no pop here)
        mov     eax, -1                 ; return -1 = caller must raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
 %endif ; !RT_ARCH_AMD64

%endmacro

IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2431
2432
;;
; Macro for implementing memory fence operation.
;
; No return value, no operands or anything.
;
; @param 1 The instruction.
;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; emit the fence instruction itself (lfence/sfence/mfence).
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro

IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2451
;;
; Alternative for non-SSE2 host.
;
; A locked read-modify-write on the stack acts as a full memory barrier: an
; xchg with a memory operand carries an implicit LOCK prefix. xAX is swapped
; with its own saved copy, so no register is actually modified.
;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX
        xchg    xAX, [xSP]              ; implicitly LOCKed - serializes memory accesses.
        add     xSP, xCB                ; drop the stack slot again.
        ret
ENDPROC iemAImpl_alt_mem_fence
2461
2462
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word and status word.
;
; Captures the current (host) environment with fnstenv, patches in the guest
; FCW bits and the guest FSW condition codes (keeping the host TOP), and then
; re-activates the patched environment with fldenv.
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; capture the current environment as a template.

        ; FCW - for exception, precision and rounding control.
        movzx T0, word [%1 + X86FXSTATE.FCW]
        and T0, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only the guest mask/precision/rounding bits.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1, word [%1 + X86FXSTATE.FSW]
        and T1, X86_FSW_C_MASK          ; guest condition code bits ...
        movzx T0, word [xSP + X86FSTENV32P.FSW]
        and T0, X86_FSW_TOP_MASK        ; ... merged with the current TOP.
        or T0, T1
        mov [xSP + X86FSTENV32P.FSW], T0_16

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2488
2489
;;
; Initialize the FPU for the actual instruction being emulated, this means
; loading parts of the guest's control word, status word, and update the
; tag word for the top register if it's empty.
;
; ASSUMES actual TOP=7
;
; @uses 24 bytes of stack. T0, T1
; @param 1 Expression giving the address of the FXSTATE of the guest.
;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; capture the current environment as a template.

        ; FCW - for exception, precision and rounding control.
        movzx T0_32, word [%1 + X86FXSTATE.FCW]
        and T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK ; only the guest mask/precision/rounding bits.
        mov [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        and T1_32, X86_FSW_C_MASK       ; guest condition code bits ...
        movzx T0_32, word [xSP + X86FSTENV32P.FSW]
        and T0_32, X86_FSW_TOP_MASK     ; ... merged with the current TOP.
        or T0_32, T1_32
        mov [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx T1_32, word [%1 + X86FXSTATE.FSW]
        shr T1_32, X86_FSW_TOP_SHIFT
        and T1_32, X86_FSW_TOP_SMASK    ; T1 = guest TOP = physical register index of guest ST0.
        bt [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc %%st0_not_empty
        or word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3 (empty).
%%st0_not_empty:

        fldenv [xSP]                    ; activate the merged environment.
%endmacro
2527
2528
;;
; Result of an FPU operation producing one 80-bit value and a status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULT
    .r80Result resw 5                   ; The 80-bit (10 byte) floating point result.
    .FSW resw 1                         ; The output FPU status word.
endstruc
2536
2537
;;
; Result of an FPU operation producing two 80-bit values and a status word.
; Need to move this as well somewhere better?
;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; The first 80-bit floating point result.
    .FSW resw 1                         ; The output FPU status word.
    .r80Result2 resw 5                  ; The second 80-bit floating point result.
endstruc
2546
2547
2548;
2549;---------------------- 16-bit signed integer operations ----------------------
2550;
2551
2552
;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW (fnstenv/fldenv).

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    word [A2]               ; ST0 = *(int16_t const *)A2.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2576
2577
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW,
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; so only the store sees guest exception masks/rounding.
        fistp   word [A2]               ; store using the guest rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2601
2602
;;
; Store a 80-bit floating point value (register) as a 16-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 16-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  word [A2]               ; truncating store - ignores the FCW rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2627
2628
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; e.g. fiadd word [A3] - result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2665
2666
;;
; FPU instruction working on one 80-bit and one 16-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      word [A3]               ; e.g. ficom word [A3] - only affects FSW.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro

IEMIMPL_FPU_R80_BY_I16_FSW ficom
2697
2698
2699
2700;
2701;---------------------- 32-bit signed integer operations ----------------------
2702;
2703
2704
;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    dword [A2]              ; ST0 = *(int32_t const *)A2.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2728
2729
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   dword [A2]              ; store using the guest rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2753
2754
;;
; Store a 80-bit floating point value (register) as a 32-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  dword [A2]              ; truncating store - ignores the FCW rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2779
2780
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. fiadd dword [A3] - result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2817
2818
;;
; FPU instruction working on one 80-bit and one 32-bit signed integer value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. ficom dword [A3] - only affects FSW.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro

IEMIMPL_FPU_R80_BY_I32_FSW ficom
2849
2850
2851
2852;
2853;---------------------- 64-bit signed integer operations ----------------------
2854;
2855
2856
;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fild    qword [A2]              ; ST0 = *(int64_t const *)A2.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2880
2881
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fistp   qword [A2]              ; store using the guest rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2905
2906
;;
; Store a 80-bit floating point value (register) as a 64-bit signed integer
; (memory) with truncation.
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit signed integer value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fisttp  qword [A2]              ; truncating store - ignores the FCW rounding mode.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2931
2932
2933
2934;
2935;---------------------- 32-bit floating point operations ----------------------
2936;
2937
;;
; Converts a 32-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     dword [A2]              ; ST0 = (80-bit) *(float const *)A2.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2961
2962
;;
; Store a 80-bit floating point value (register) as a 32-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 32-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     dword [A2]              ; convert + store using the guest rounding/precision.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2986
2987
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. fadd dword [A3] - result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3024
3025
;;
; FPU instruction working on one 80-bit and one 32-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      dword [A3]              ; e.g. fcom dword [A3] - only affects FSW.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro

IEMIMPL_FPU_R80_BY_R32_FSW fcom
3056
3057
3058
3059;
3060;---------------------- 64-bit floating point operations ----------------------
3061;
3062
;;
; Converts a 64-bit floating point value to a 80-bit one (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit floating point value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     qword [A2]              ; ST0 = (80-bit) *(double const *)A2.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3086
3087
;;
; Store a 80-bit floating point value (register) as a 64-bit one (memory).
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 64-bit value.
; @param A3 Pointer to the 80-bit value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fst     qword [A2]              ; convert + store using the guest rounding/precision.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3111
3112
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; e.g. fadd qword [A3] - result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3149
;;
; FPU instruction working on one 80-bit and one 64-bit floating point value,
; only returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Where to store the output FSW.
; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 64-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; ST0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      qword [A3]              ; e.g. fcom qword [A3] - only affects FSW.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro

IEMIMPL_FPU_R80_BY_R64_FSW fcom
3180
3181
3182
3183;
3184;---------------------- 80-bit floating point operations ----------------------
3185;
3186
;;
; Loads a 80-bit floating point register value from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit floating point value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fld     tword [A2]              ; ST0 = *A2 (exact, no conversion).

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3210
3211
;;
; Store a 80-bit floating point register to memory
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fstp    tword [A2]              ; exact 80-bit copy-out (no conversion).

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3235
3236
;;
; Loads an 80-bit floating point register value in BCD format from memory.
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 80-bit BCD value to load.
;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbld    tword [A2]              ; ST0 = packed BCD at A2, converted.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3260
3261
;;
; Store a 80-bit floating point register to memory as BCD
;
; @param A0 FPU context (fxsave).
; @param A1 Where to return the output FSW.
; @param A2 Where to store the 80-bit BCD value.
; @param A3 Pointer to the 80-bit register value.
;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; load the source before applying the guest FCW.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        fbstp   tword [A2]              ; convert to packed BCD and store.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3285
3286
;;
; FPU instruction working on two 80-bit floating point values.
;
; @param 1 The instruction
; @param 2 The instruction operand list, e.g. {st0, st1}, or {} for
;          instructions taking no explicit operands (fprem & friends).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
;
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; becomes ST1 after the next load.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      %2                      ; e.g. fadd st0, st1 - result in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3327
3328
;;
; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
; storing the result in ST1 and popping the stack.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST1).
; @param A3 Pointer to the second 80-bit value (ST0).
;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A2]              ; becomes ST1 after the next load.
        fld     tword [A3]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1                              ; e.g. fpatan - stores into ST1 and pops, result ends up in ST0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; return the resulting status word.
        fnclex                          ; clear pending exceptions so the fstp below cannot trap.
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3364
3365
;;
; FPU instruction working on two 80-bit floating point values, only
; returning FSW.
;
; @param 1 The instruction
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; becomes ST1 after the next load.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st0, st1                ; e.g. fcom st0, st1 - only affects FSW.

        fnstsw  word [A1]               ; return the resulting status word.

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_FSW fcom
IEMIMPL_FPU_R80_BY_R80_FSW fucom
3398
3399
;;
; FPU instruction working on two 80-bit floating point values,
; returning FSW and EFLAGS (eax).
;
; @param 1 The instruction
;
; @returns EFLAGS in EAX.
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a uint16_t for the resulting FSW.
; @param A2 Pointer to the first 80-bit value.
; @param A3 Pointer to the second 80-bit value.
;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; scratch area for FPU_LD_FXSTATE_FCW_AND_SAFE_FSW.

        fninit                          ; start from a known, clean FPU state.
        fld     tword [A3]              ; becomes ST1 after the next load.
        fld     tword [A2]              ; ST0.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
        %1      st1                     ; e.g. fcomi st1 - compares ST0 with ST1, result in EFLAGS.

        fnstsw  word [A1]               ; return the resulting status word.
        pushf                           ; return the comparison result (EFLAGS) in xAX.
        pop     xAX

        fninit                          ; leave the host FPU in a clean state.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro

IEMIMPL_FPU_R80_BY_R80_EFL fcomi
IEMIMPL_FPU_R80_BY_R80_EFL fucomi
3435
3436
3437;;
3438; FPU instruction working on one 80-bit floating point value.
3439;
3440; @param 1 The instruction
3441;
3442; @param A0 FPU context (fxsave).
3443; @param A1 Pointer to a IEMFPURESULT for the output.
3444; @param A2 Pointer to the 80-bit value.
3445;
3446%macro IEMIMPL_FPU_R80 1
3447BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3448 PROLOGUE_3_ARGS
3449 sub xSP, 20h
3450
3451 fninit
3452 fld tword [A2]
3453 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3454 %1
3455
3456 fnstsw word [A1 + IEMFPURESULT.FSW]
3457 fnclex
3458 fstp tword [A1 + IEMFPURESULT.r80Result]
3459
3460 fninit
3461 add xSP, 20h
3462 EPILOGUE_3_ARGS
3463ENDPROC iemAImpl_ %+ %1 %+ _r80
3464%endmacro
3465
3466IEMIMPL_FPU_R80 fchs
3467IEMIMPL_FPU_R80 fabs
3468IEMIMPL_FPU_R80 f2xm1
3469IEMIMPL_FPU_R80 fsqrt
3470IEMIMPL_FPU_R80 frndint
3471IEMIMPL_FPU_R80 fsin
3472IEMIMPL_FPU_R80 fcos
3473
3474
3475;;
3476; FPU instruction working on one 80-bit floating point value, only
3477; returning FSW.
3478;
3479; @param 1 The instruction
3480; @param 2 Non-zero to also restore FTW.
3481;
3482; @param A0 FPU context (fxsave).
3483; @param A1 Pointer to a uint16_t for the resulting FSW.
3484; @param A2 Pointer to the 80-bit value.
3485;
3486%macro IEMIMPL_FPU_R80_FSW 2
3487BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
3488 PROLOGUE_3_ARGS
3489 sub xSP, 20h
3490
3491 fninit
3492 fld tword [A2]
3493%if %2 != 0
3494 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
3495%else
3496 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3497%endif
3498 %1
3499
3500 fnstsw word [A1]
3501
3502 fninit
3503 add xSP, 20h
3504 EPILOGUE_3_ARGS
3505ENDPROC iemAImpl_ %+ %1 %+ _r80
3506%endmacro
3507
3508IEMIMPL_FPU_R80_FSW ftst, 0
3509IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3510
3511
3512
3513;;
3514; FPU instruction loading a 80-bit floating point constant.
3515;
3516; @param 1 The instruction
3517;
3518; @param A0 FPU context (fxsave).
3519; @param A1 Pointer to a IEMFPURESULT for the output.
3520;
3521%macro IEMIMPL_FPU_R80_CONST 1
3522BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
3523 PROLOGUE_2_ARGS
3524 sub xSP, 20h
3525
3526 fninit
3527 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3528 %1
3529
3530 fnstsw word [A1 + IEMFPURESULT.FSW]
3531 fnclex
3532 fstp tword [A1 + IEMFPURESULT.r80Result]
3533
3534 fninit
3535 add xSP, 20h
3536 EPILOGUE_2_ARGS
3537ENDPROC iemAImpl_ %+ %1 %+
3538%endmacro
3539
3540IEMIMPL_FPU_R80_CONST fld1
3541IEMIMPL_FPU_R80_CONST fldl2t
3542IEMIMPL_FPU_R80_CONST fldl2e
3543IEMIMPL_FPU_R80_CONST fldpi
3544IEMIMPL_FPU_R80_CONST fldlg2
3545IEMIMPL_FPU_R80_CONST fldln2
3546IEMIMPL_FPU_R80_CONST fldz
3547
3548
3549;;
3550; FPU instruction working on one 80-bit floating point value, outputing two.
3551;
3552; @param 1 The instruction
3553;
3554; @param A0 FPU context (fxsave).
3555; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3556; @param A2 Pointer to the 80-bit value.
3557;
3558%macro IEMIMPL_FPU_R80_R80 1
3559BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
3560 PROLOGUE_3_ARGS
3561 sub xSP, 20h
3562
3563 fninit
3564 fld tword [A2]
3565 FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
3566 %1
3567
3568 fnstsw word [A1 + IEMFPURESULTTWO.FSW]
3569 fnclex
3570 fstp tword [A1 + IEMFPURESULTTWO.r80Result2]
3571 fnclex
3572 fstp tword [A1 + IEMFPURESULTTWO.r80Result1]
3573
3574 fninit
3575 add xSP, 20h
3576 EPILOGUE_3_ARGS
3577ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
3578%endmacro
3579
3580IEMIMPL_FPU_R80_R80 fptan
3581IEMIMPL_FPU_R80_R80 fxtract
3582IEMIMPL_FPU_R80_R80 fsincos
3583
3584
3585
3586
3587;---------------------- SSE and MMX Operations ----------------------
3588
3589;; @todo what do we need to do for MMX?
3590%macro IEMIMPL_MMX_PROLOGUE 0
3591%endmacro
3592%macro IEMIMPL_MMX_EPILOGUE 0
3593%endmacro
3594
3595;; @todo what do we need to do for SSE?
3596%macro IEMIMPL_SSE_PROLOGUE 0
3597%endmacro
3598%macro IEMIMPL_SSE_EPILOGUE 0
3599%endmacro
3600
3601;; @todo what do we need to do for AVX?
3602%macro IEMIMPL_AVX_PROLOGUE 0
3603%endmacro
3604%macro IEMIMPL_AVX_EPILOGUE 0
3605%endmacro
3606
3607
3608;;
3609; Media instruction working on two full sized registers.
3610;
3611; @param 1 The instruction
3612; @param 2 Whether there is an MMX variant (1) or not (0).
3613;
3614; @param A0 FPU context (fxsave).
3615; @param A1 Pointer to the first media register size operand (input/output).
3616; @param A2 Pointer to the second media register size operand (input).
3617;
3618%macro IEMIMPL_MEDIA_F2 2
3619%if %2 != 0
3620BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
3621 PROLOGUE_3_ARGS
3622 IEMIMPL_MMX_PROLOGUE
3623
3624 movq mm0, [A1]
3625 movq mm1, [A2]
3626 %1 mm0, mm1
3627 movq [A1], mm0
3628
3629 IEMIMPL_MMX_EPILOGUE
3630 EPILOGUE_3_ARGS
3631ENDPROC iemAImpl_ %+ %1 %+ _u64
3632%endif
3633
3634BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3635 PROLOGUE_3_ARGS
3636 IEMIMPL_SSE_PROLOGUE
3637
3638 movdqu xmm0, [A1]
3639 movdqu xmm1, [A2]
3640 %1 xmm0, xmm1
3641 movdqu [A1], xmm0
3642
3643 IEMIMPL_SSE_EPILOGUE
3644 EPILOGUE_3_ARGS
3645ENDPROC iemAImpl_ %+ %1 %+ _u128
3646%endmacro
3647
3648IEMIMPL_MEDIA_F2 pshufb, 1
3649IEMIMPL_MEDIA_F2 pand, 1
3650IEMIMPL_MEDIA_F2 pandn, 1
3651IEMIMPL_MEDIA_F2 por, 1
3652IEMIMPL_MEDIA_F2 pxor, 1
3653IEMIMPL_MEDIA_F2 pcmpeqb, 1
3654IEMIMPL_MEDIA_F2 pcmpeqw, 1
3655IEMIMPL_MEDIA_F2 pcmpeqd, 1
3656IEMIMPL_MEDIA_F2 pcmpeqq, 0
3657IEMIMPL_MEDIA_F2 pcmpgtb, 1
3658IEMIMPL_MEDIA_F2 pcmpgtw, 1
3659IEMIMPL_MEDIA_F2 pcmpgtd, 1
3660IEMIMPL_MEDIA_F2 pcmpgtq, 0
3661IEMIMPL_MEDIA_F2 paddb, 1
3662IEMIMPL_MEDIA_F2 paddw, 1
3663IEMIMPL_MEDIA_F2 paddd, 1
3664IEMIMPL_MEDIA_F2 paddq, 1
3665IEMIMPL_MEDIA_F2 paddsb, 1
3666IEMIMPL_MEDIA_F2 paddsw, 1
3667IEMIMPL_MEDIA_F2 paddusb, 1
3668IEMIMPL_MEDIA_F2 paddusw, 1
3669IEMIMPL_MEDIA_F2 psubb, 1
3670IEMIMPL_MEDIA_F2 psubw, 1
3671IEMIMPL_MEDIA_F2 psubd, 1
3672IEMIMPL_MEDIA_F2 psubq, 1
3673IEMIMPL_MEDIA_F2 psubsb, 1
3674IEMIMPL_MEDIA_F2 psubsw, 1
3675IEMIMPL_MEDIA_F2 psubusb, 1
3676IEMIMPL_MEDIA_F2 psubusw, 1
3677IEMIMPL_MEDIA_F2 pmullw, 1
3678IEMIMPL_MEDIA_F2 pmulld, 0
3679IEMIMPL_MEDIA_F2 pmulhw, 1
3680IEMIMPL_MEDIA_F2 pmaddwd, 1
3681IEMIMPL_MEDIA_F2 pminub, 1
3682IEMIMPL_MEDIA_F2 pminuw, 0
3683IEMIMPL_MEDIA_F2 pminud, 0
3684IEMIMPL_MEDIA_F2 pminsb, 0
3685IEMIMPL_MEDIA_F2 pminsw, 1
3686IEMIMPL_MEDIA_F2 pminsd, 0
3687IEMIMPL_MEDIA_F2 pmaxub, 1
3688IEMIMPL_MEDIA_F2 pmaxuw, 0
3689IEMIMPL_MEDIA_F2 pmaxud, 0
3690IEMIMPL_MEDIA_F2 pmaxsb, 0
3691IEMIMPL_MEDIA_F2 pmaxsw, 1
3692IEMIMPL_MEDIA_F2 pmaxsd, 0
3693IEMIMPL_MEDIA_F2 pabsb, 1
3694IEMIMPL_MEDIA_F2 pabsw, 1
3695IEMIMPL_MEDIA_F2 pabsd, 1
3696IEMIMPL_MEDIA_F2 psignb, 1
3697IEMIMPL_MEDIA_F2 psignw, 1
3698IEMIMPL_MEDIA_F2 psignd, 1
3699IEMIMPL_MEDIA_F2 phaddw, 1
3700IEMIMPL_MEDIA_F2 phaddd, 1
3701IEMIMPL_MEDIA_F2 phsubw, 1
3702IEMIMPL_MEDIA_F2 phsubd, 1
3703IEMIMPL_MEDIA_F2 phaddsw, 1
3704IEMIMPL_MEDIA_F2 phsubsw, 1
3705IEMIMPL_MEDIA_F2 pmaddubsw, 1
3706IEMIMPL_MEDIA_F2 pmulhrsw, 1
3707IEMIMPL_MEDIA_F2 pmuludq, 1
3708
3709
3710;;
3711; Media instruction working on two full sized registers, but no FXSAVE state argument.
3712;
3713; @param 1 The instruction
3714; @param 2 Whether there is an MMX variant (1) or not (0).
3715;
3716; @param A0 Pointer to the first media register size operand (input/output).
3717; @param A1 Pointer to the second media register size operand (input).
3718;
3719%macro IEMIMPL_MEDIA_OPT_F2 2
3720%if %2 != 0
3721BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3722 PROLOGUE_2_ARGS
3723 IEMIMPL_MMX_PROLOGUE
3724
3725 movq mm0, [A0]
3726 movq mm1, [A1]
3727 %1 mm0, mm1
3728 movq [A0], mm0
3729
3730 IEMIMPL_MMX_EPILOGUE
3731 EPILOGUE_2_ARGS
3732ENDPROC iemAImpl_ %+ %1 %+ _u64
3733%endif
3734
3735BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3736 PROLOGUE_2_ARGS
3737 IEMIMPL_SSE_PROLOGUE
3738
3739 movdqu xmm0, [A0]
3740 movdqu xmm1, [A1]
3741 %1 xmm0, xmm1
3742 movdqu [A0], xmm0
3743
3744 IEMIMPL_SSE_EPILOGUE
3745 EPILOGUE_2_ARGS
3746ENDPROC iemAImpl_ %+ %1 %+ _u128
3747%endmacro
3748
3749IEMIMPL_MEDIA_OPT_F2 packsswb, 1
3750IEMIMPL_MEDIA_OPT_F2 packssdw, 1
3751IEMIMPL_MEDIA_OPT_F2 packuswb, 1
3752IEMIMPL_MEDIA_OPT_F2 packusdw, 0
3753IEMIMPL_MEDIA_OPT_F2 psllw, 1
3754IEMIMPL_MEDIA_OPT_F2 pslld, 1
3755IEMIMPL_MEDIA_OPT_F2 psllq, 1
3756IEMIMPL_MEDIA_OPT_F2 psrlw, 1
3757IEMIMPL_MEDIA_OPT_F2 psrld, 1
3758IEMIMPL_MEDIA_OPT_F2 psrlq, 1
3759IEMIMPL_MEDIA_OPT_F2 psraw, 1
3760IEMIMPL_MEDIA_OPT_F2 psrad, 1
3761IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
3762IEMIMPL_MEDIA_OPT_F2 pavgb, 1
3763IEMIMPL_MEDIA_OPT_F2 pavgw, 1
3764IEMIMPL_MEDIA_OPT_F2 psadbw, 1
3765IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
3766IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
3767IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
3768IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
3769IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
3770IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
3771IEMIMPL_MEDIA_OPT_F2 aesimc, 0
3772IEMIMPL_MEDIA_OPT_F2 aesenc, 0
3773IEMIMPL_MEDIA_OPT_F2 aesdec, 0
3774IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
3775IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
3776IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
3777IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
3778IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
3779IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
3780IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3781
3782;;
3783; Media instruction working on one full sized and one half sized register (lower half).
3784;
3785; @param 1 The instruction
3786; @param 2 1 if MMX is included, 0 if not.
3787;
3788; @param A0 Pointer to the first full sized media register operand (input/output).
3789; @param A1 Pointer to the second half sized media register operand (input).
3790;
3791%macro IEMIMPL_MEDIA_F1L1 2
3792 %if %2 != 0
3793BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
3794 PROLOGUE_2_ARGS
3795 IEMIMPL_MMX_PROLOGUE
3796
3797 movq mm0, [A0]
3798 movq mm1, [A1]
3799 %1 mm0, mm1
3800 movq [A0], mm0
3801
3802 IEMIMPL_MMX_EPILOGUE
3803 EPILOGUE_2_ARGS
3804ENDPROC iemAImpl_ %+ %1 %+ _u64
3805 %endif
3806
3807BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
3808 PROLOGUE_2_ARGS
3809 IEMIMPL_SSE_PROLOGUE
3810
3811 movdqu xmm0, [A0]
3812 movdqu xmm1, [A1]
3813 %1 xmm0, xmm1
3814 movdqu [A0], xmm0
3815
3816 IEMIMPL_SSE_EPILOGUE
3817 EPILOGUE_2_ARGS
3818ENDPROC iemAImpl_ %+ %1 %+ _u128
3819%endmacro
3820
3821IEMIMPL_MEDIA_F1L1 punpcklbw, 1
3822IEMIMPL_MEDIA_F1L1 punpcklwd, 1
3823IEMIMPL_MEDIA_F1L1 punpckldq, 1
3824IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3825
3826
3827;;
3828; Media instruction working two half sized input registers (lower half) and a full sized
3829; destination register (vpunpckh*).
3830;
3831; @param 1 The instruction
3832;
3833; @param A0 Pointer to the destination register (full sized, output only).
3834; @param A1 Pointer to the first full sized media source register operand, where we
3835; will only use the lower half as input - but we'll be loading it in full.
3836; @param A2 Pointer to the second full sized media source register operand, where we
3837; will only use the lower half as input - but we'll be loading it in full.
3838;
3839%macro IEMIMPL_MEDIA_F1L1L1 1
3840BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
3841 PROLOGUE_3_ARGS
3842 IEMIMPL_AVX_PROLOGUE
3843
3844 vmovdqu xmm0, [A1]
3845 vmovdqu xmm1, [A2]
3846 %1 xmm0, xmm0, xmm1
3847 vmovdqu [A0], xmm0
3848
3849 IEMIMPL_AVX_PROLOGUE
3850 EPILOGUE_3_ARGS
3851ENDPROC iemAImpl_ %+ %1 %+ _u128
3852
3853BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
3854 PROLOGUE_3_ARGS
3855 IEMIMPL_AVX_PROLOGUE
3856
3857 vmovdqu ymm0, [A1]
3858 vmovdqu ymm1, [A2]
3859 %1 ymm0, ymm0, ymm1
3860 vmovdqu [A0], ymm0
3861
3862 IEMIMPL_AVX_PROLOGUE
3863 EPILOGUE_3_ARGS
3864ENDPROC iemAImpl_ %+ %1 %+ _u256
3865%endmacro
3866
3867IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
3868IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
3869IEMIMPL_MEDIA_F1L1L1 vpunpckldq
3870IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3871
3872
3873;;
3874; Media instruction working on one full sized and one half sized register (high half).
3875;
3876; @param 1 The instruction
3877; @param 2 1 if MMX is included, 0 if not.
3878;
3879; @param A0 Pointer to the first full sized media register operand (input/output).
3880; @param A1 Pointer to the second full sized media register operand, where we
3881; will only use the upper half as input - but we'll load it in full.
3882;
3883%macro IEMIMPL_MEDIA_F1H1 2
3884IEMIMPL_MEDIA_F1L1 %1, %2
3885%endmacro
3886
3887IEMIMPL_MEDIA_F1L1 punpckhbw, 1
3888IEMIMPL_MEDIA_F1L1 punpckhwd, 1
3889IEMIMPL_MEDIA_F1L1 punpckhdq, 1
3890IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3891
3892
3893;;
3894; Media instruction working two half sized input registers (high half) and a full sized
3895; destination register (vpunpckh*).
3896;
3897; @param 1 The instruction
3898;
3899; @param A0 Pointer to the destination register (full sized, output only).
3900; @param A1 Pointer to the first full sized media source register operand, where we
3901; will only use the upper half as input - but we'll be loading it in full.
3902; @param A2 Pointer to the second full sized media source register operand, where we
3903; will only use the upper half as input - but we'll be loading it in full.
3904;
3905%macro IEMIMPL_MEDIA_F1H1H1 1
3906IEMIMPL_MEDIA_F1L1L1 %1
3907%endmacro
3908
3909IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
3910IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
3911IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
3912IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3913
3914
3915;
3916; Shufflers with evil 8-bit immediates.
3917;
3918
3919BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
3920 PROLOGUE_3_ARGS
3921 IEMIMPL_MMX_PROLOGUE
3922
3923 movq mm1, [A1]
3924 movq mm0, mm0 ; paranoia!
3925 lea T1, [.imm0 xWrtRIP]
3926 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
3927 lea T0, [A2 + A2*8] ; sizeof(pshufw+ret) == 9
3928 %else
3929 lea T0, [A2 + A2*4] ; sizeof(pshufw+ret) == 5
3930 %endif
3931 lea T1, [T1 + T0]
3932 IBT_NOTRACK
3933 call T1
3934 movq [A0], mm0
3935
3936 IEMIMPL_MMX_EPILOGUE
3937 EPILOGUE_3_ARGS
3938%assign bImm 0
3939%rep 256
3940.imm %+ bImm:
3941 IBT_ENDBRxx_WITHOUT_NOTRACK
3942 pshufw mm0, mm1, bImm
3943 ret
3944 %assign bImm bImm + 1
3945%endrep
3946.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
3947ENDPROC iemAImpl_pshufw_u64
3948
3949
;;
; SSE pshufhw/pshuflw/pshufd with an 8-bit immediate, implemented via a table
; of 256 fixed-size '<insn> xmm0, xmm1, imm / ret' stubs indexed by immediate.
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source (input).
; @param A2 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia! (pshufhw/pshuflw only rewrite half of xmm0)
        lea     T1, [.imm0 xWrtRIP]     ; start of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + immediate * stub size
        IBT_NOTRACK
        call    T1                      ; executes '<insn> xmm0, xmm1, <A2>'
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS

 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3986
3987
;;
; AVX vpshufhw/vpshuflw/vpshufd with an 8-bit immediate, implemented via a
; table of 256 fixed-size '<insn> ymm0, ymm1, imm / ret' stubs.
;
; @param 1 The instruction
;
; @param A0 Pointer to the destination (output).
; @param A1 Pointer to the source (input).
; @param A2 The 8-bit immediate (0..255).
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE - this is AVX code (both currently expand empty)

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia! (vpshufhw/vpshuflw only rewrite half of each lane)
        lea     T1, [.imm0 xWrtRIP]     ; start of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(pshufXX+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + immediate * stub size
        IBT_NOTRACK
        call    T1                      ; executes '<insn> ymm0, ymm1, <A2>'
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
4023
4024
4025;
4026; Shifts with evil 8-bit immediates.
4027;
4028
4029%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
4030BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
4031 PROLOGUE_2_ARGS
4032 IEMIMPL_MMX_PROLOGUE
4033
4034 movq mm0, [A0]
4035 lea T1, [.imm0 xWrtRIP]
4036 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4037 lea T0, [A1 + A1*8] ; sizeof(psXX+ret) == 9
4038 %else
4039 lea T0, [A1 + A1*4] ; sizeof(psXX+ret) == 5
4040 %endif
4041 lea T1, [T1 + T0]
4042 IBT_NOTRACK
4043 call T1
4044 movq [A0], mm0
4045
4046 IEMIMPL_MMX_EPILOGUE
4047 EPILOGUE_2_ARGS
4048%assign bImm 0
4049%rep 256
4050.imm %+ bImm:
4051 IBT_ENDBRxx_WITHOUT_NOTRACK
4052 %1 mm0, bImm
4053 ret
4054 %assign bImm bImm + 1
4055%endrep
4056.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
4057ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
4058%endmacro
4059
4060IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
4061IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
4062IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
4063IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
4064IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
4065IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
4066IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
4067IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4068
4069
;;
; SSE shift-by-immediate, implemented via a table of 256 fixed-size
; '<insn> xmm0, imm / ret' stubs indexed by immediate value.
;
; @param 1 The instruction
;
; @param A0 Pointer to the operand (input/output).
; @param A1 The 8-bit shift count immediate (0..255).
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; start of the stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 10: A1 * 10 = (A1 * 5) * 2
 %else
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: A1 * 6 = (A1 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + immediate * stub size
        IBT_NOTRACK
        call    T1                      ; executes '<insn> xmm0, <A1>'
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4111
4112
4113;
4114; Move byte mask.
4115;
4116
4117BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
4118 PROLOGUE_2_ARGS
4119 IEMIMPL_MMX_PROLOGUE
4120
4121 movq mm1, [A1]
4122 pmovmskb T0, mm1
4123 mov [A0], T0
4124%ifdef RT_ARCH_X86
4125 mov dword [A0 + 4], 0
4126%endif
4127 IEMIMPL_MMX_EPILOGUE
4128 EPILOGUE_2_ARGS
4129ENDPROC iemAImpl_pmovmskb_u64
4130
;;
; pmovmskb on a 128-bit (SSE) source.
;
; @param A0 Pointer to the 64-bit destination mask (output).
; @param A1 Pointer to the 128-bit source (input).
;
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1               ; one bit per source byte -> low 16 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper half
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128
4144
;;
; vpmovmskb on a 256-bit (AVX2) source.
;
; @param A0 Pointer to the 64-bit destination mask (output).
; @param A1 Pointer to the 256-bit source (input).
;
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1              ; one bit per source byte -> low 32 bits of T0
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; T0 is only 32 bits wide on x86; zero the upper half
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4158
4159
4160;;
4161; Media instruction working on two full sized source registers and one destination (AVX).
4162;
4163; @param 1 The instruction
4164;
4165; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4166; @param A1 Pointer to the destination media register size operand (output).
4167; @param A2 Pointer to the first source media register size operand (input).
4168; @param A3 Pointer to the second source media register size operand (input).
4169;
4170%macro IEMIMPL_MEDIA_F3 1
4171BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4172 PROLOGUE_4_ARGS
4173 IEMIMPL_AVX_PROLOGUE
4174
4175 vmovdqu xmm0, [A2]
4176 vmovdqu xmm1, [A3]
4177 %1 xmm0, xmm0, xmm1
4178 vmovdqu [A1], xmm0
4179
4180 IEMIMPL_AVX_PROLOGUE
4181 EPILOGUE_4_ARGS
4182ENDPROC iemAImpl_ %+ %1 %+ _u128
4183
4184BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
4185 PROLOGUE_4_ARGS
4186 IEMIMPL_AVX_PROLOGUE
4187
4188 vmovdqu ymm0, [A2]
4189 vmovdqu ymm1, [A3]
4190 %1 ymm0, ymm0, ymm1
4191 vmovdqu [A1], ymm0
4192
4193 IEMIMPL_AVX_PROLOGUE
4194 EPILOGUE_4_ARGS
4195ENDPROC iemAImpl_ %+ %1 %+ _u256
4196%endmacro
4197
4198IEMIMPL_MEDIA_F3 vpshufb
4199IEMIMPL_MEDIA_F3 vpand
4200IEMIMPL_MEDIA_F3 vpminub
4201IEMIMPL_MEDIA_F3 vpminuw
4202IEMIMPL_MEDIA_F3 vpminud
4203IEMIMPL_MEDIA_F3 vpminsb
4204IEMIMPL_MEDIA_F3 vpminsw
4205IEMIMPL_MEDIA_F3 vpminsd
4206IEMIMPL_MEDIA_F3 vpmaxub
4207IEMIMPL_MEDIA_F3 vpmaxuw
4208IEMIMPL_MEDIA_F3 vpmaxud
4209IEMIMPL_MEDIA_F3 vpmaxsb
4210IEMIMPL_MEDIA_F3 vpmaxsw
4211IEMIMPL_MEDIA_F3 vpmaxsd
4212IEMIMPL_MEDIA_F3 vpandn
4213IEMIMPL_MEDIA_F3 vpor
4214IEMIMPL_MEDIA_F3 vpxor
4215IEMIMPL_MEDIA_F3 vpcmpeqb
4216IEMIMPL_MEDIA_F3 vpcmpeqw
4217IEMIMPL_MEDIA_F3 vpcmpeqd
4218IEMIMPL_MEDIA_F3 vpcmpeqq
4219IEMIMPL_MEDIA_F3 vpcmpgtb
4220IEMIMPL_MEDIA_F3 vpcmpgtw
4221IEMIMPL_MEDIA_F3 vpcmpgtd
4222IEMIMPL_MEDIA_F3 vpcmpgtq
4223IEMIMPL_MEDIA_F3 vpaddb
4224IEMIMPL_MEDIA_F3 vpaddw
4225IEMIMPL_MEDIA_F3 vpaddd
4226IEMIMPL_MEDIA_F3 vpaddq
4227IEMIMPL_MEDIA_F3 vpsubb
4228IEMIMPL_MEDIA_F3 vpsubw
4229IEMIMPL_MEDIA_F3 vpsubd
4230IEMIMPL_MEDIA_F3 vpsubq
4231
4232
4233;;
4234; Media instruction working on two full sized source registers and one destination (AVX),
4235; but no XSAVE state pointer argument.
4236;
4237; @param 1 The instruction
4238;
4239; @param A0 Pointer to the destination media register size operand (output).
4240; @param A1 Pointer to the first source media register size operand (input).
4241; @param A2 Pointer to the second source media register size operand (input).
4242;
4243%macro IEMIMPL_MEDIA_OPT_F3 1
4244BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4245 PROLOGUE_3_ARGS
4246 IEMIMPL_AVX_PROLOGUE
4247
4248 vmovdqu xmm0, [A1]
4249 vmovdqu xmm1, [A2]
4250 %1 xmm0, xmm0, xmm1
4251 vmovdqu [A0], xmm0
4252
4253 IEMIMPL_AVX_PROLOGUE
4254 EPILOGUE_3_ARGS
4255ENDPROC iemAImpl_ %+ %1 %+ _u128
4256
4257BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4258 PROLOGUE_3_ARGS
4259 IEMIMPL_AVX_PROLOGUE
4260
4261 vmovdqu ymm0, [A1]
4262 vmovdqu ymm1, [A2]
4263 %1 ymm0, ymm0, ymm1
4264 vmovdqu [A0], ymm0
4265
4266 IEMIMPL_AVX_PROLOGUE
4267 EPILOGUE_3_ARGS
4268ENDPROC iemAImpl_ %+ %1 %+ _u256
4269%endmacro
4270
4271IEMIMPL_MEDIA_OPT_F3 vpacksswb
4272IEMIMPL_MEDIA_OPT_F3 vpackssdw
4273IEMIMPL_MEDIA_OPT_F3 vpackuswb
4274IEMIMPL_MEDIA_OPT_F3 vpackusdw
4275IEMIMPL_MEDIA_OPT_F3 vpmullw
4276IEMIMPL_MEDIA_OPT_F3 vpmulld
4277IEMIMPL_MEDIA_OPT_F3 vpmulhw
4278IEMIMPL_MEDIA_OPT_F3 vpmulhuw
4279IEMIMPL_MEDIA_OPT_F3 vpavgb
4280IEMIMPL_MEDIA_OPT_F3 vpavgw
4281IEMIMPL_MEDIA_OPT_F3 vpsignb
4282IEMIMPL_MEDIA_OPT_F3 vpsignw
4283IEMIMPL_MEDIA_OPT_F3 vpsignd
4284IEMIMPL_MEDIA_OPT_F3 vphaddw
4285IEMIMPL_MEDIA_OPT_F3 vphaddd
4286IEMIMPL_MEDIA_OPT_F3 vphsubw
4287IEMIMPL_MEDIA_OPT_F3 vphsubd
4288IEMIMPL_MEDIA_OPT_F3 vphaddsw
4289IEMIMPL_MEDIA_OPT_F3 vphsubsw
4290IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
4291IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
4292IEMIMPL_MEDIA_OPT_F3 vpsadbw
4293IEMIMPL_MEDIA_OPT_F3 vpmuldq
4294IEMIMPL_MEDIA_OPT_F3 vpmuludq
4295IEMIMPL_MEDIA_OPT_F3 vunpcklps
4296IEMIMPL_MEDIA_OPT_F3 vunpcklpd
4297IEMIMPL_MEDIA_OPT_F3 vunpckhps
4298IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4299IEMIMPL_MEDIA_OPT_F3 vpsubsb
4300IEMIMPL_MEDIA_OPT_F3 vpsubsw
4301IEMIMPL_MEDIA_OPT_F3 vpsubusb
4302IEMIMPL_MEDIA_OPT_F3 vpsubusw
4303IEMIMPL_MEDIA_OPT_F3 vpaddusb
4304IEMIMPL_MEDIA_OPT_F3 vpaddusw
4305IEMIMPL_MEDIA_OPT_F3 vpaddsb
4306IEMIMPL_MEDIA_OPT_F3 vpaddsw
4307
4308
4309;;
4310; Media instruction working on one full sized source registers and one destination (AVX),
4311; but no XSAVE state pointer argument.
4312;
4313; @param 1 The instruction
4314; @param 2 Flag whether the isntruction has a 256-bit (AVX2) variant (1) or not (0).
4315;
4316; @param A0 Pointer to the destination media register size operand (output).
4317; @param A1 Pointer to the source media register size operand (input).
4318;
4319%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
4320BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4321 PROLOGUE_2_ARGS
4322 IEMIMPL_AVX_PROLOGUE
4323
4324 vmovdqu xmm0, [A1]
4325 %1 xmm0, xmm0
4326 vmovdqu [A0], xmm0
4327
4328 IEMIMPL_AVX_PROLOGUE
4329 EPILOGUE_2_ARGS
4330ENDPROC iemAImpl_ %+ %1 %+ _u128
4331
4332 %if %2 == 1
4333BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
4334 PROLOGUE_2_ARGS
4335 IEMIMPL_AVX_PROLOGUE
4336
4337 vmovdqu ymm0, [A1]
4338 %1 ymm0, ymm0
4339 vmovdqu [A0], ymm0
4340
4341 IEMIMPL_AVX_PROLOGUE
4342 EPILOGUE_2_ARGS
4343ENDPROC iemAImpl_ %+ %1 %+ _u256
4344 %endif
4345%endmacro
4346
4347IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
4348IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
4349IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
4350IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4351
4352
4353;
4354; The SSE 4.2 crc32
4355;
4356; @param A1 Pointer to the 32-bit destination.
4357; @param A2 The source operand, sized according to the suffix.
4358;
4359BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
4360 PROLOGUE_2_ARGS
4361
4362 mov T0_32, [A0]
4363 crc32 T0_32, A1_8
4364 mov [A0], T0_32
4365
4366 EPILOGUE_2_ARGS
4367ENDPROC iemAImpl_crc32_u8
4368
;; crc32 with a 16-bit source operand; A0 = CRC accumulator pointer, A1 = source.
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC accumulator
        crc32   T0_32, A1_16            ; fold in a 16-bit source value
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16
4378
;; crc32 with a 32-bit source operand; A0 = CRC accumulator pointer, A1 = source.
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC accumulator
        crc32   T0_32, A1_32            ; fold in a 32-bit source value
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32
4388
%ifdef RT_ARCH_AMD64
;; crc32 with a 64-bit source operand (AMD64 hosts only);
;; A0 = CRC accumulator pointer, A1 = source.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load current CRC accumulator (zero-extends into T0)
        crc32   T0, A1                  ; 64-bit form; the result still fits in the low 32 bits
        mov     [A0], T0_32
,
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4400
4401
4402;
4403; PTEST (SSE 4.1)
4404;
4405; @param A0 Pointer to the first source operand (aka readonly destination).
4406; @param A1 Pointer to the second source operand.
4407; @param A2 Pointer to the EFLAGS register.
4408;
4409BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
4410 PROLOGUE_3_ARGS
4411 IEMIMPL_SSE_PROLOGUE
4412
4413 movdqu xmm0, [A0]
4414 movdqu xmm1, [A1]
4415 ptest xmm0, xmm1
4416 IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0
4417
4418 IEMIMPL_SSE_EPILOGUE
4419 EPILOGUE_3_ARGS
4420ENDPROC iemAImpl_ptest_u128
4421
;;
; VPTEST, 256-bit variant.
;
; @param A0 Pointer to the first source operand (aka readonly destination).
; @param A1 Pointer to the second source operand.
; @param A2 Pointer to the EFLAGS register.
;
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; fixed: was IEMIMPL_SSE_PROLOGUE - this is AVX code (both currently expand empty)

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1              ; sets ZF/CF from the AND / ANDN tests
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; merge all status flags into *A2

        IEMIMPL_AVX_EPILOGUE            ; fixed: was IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4434
4435
4436;;
4437; Template for the [v]pmov{s,z}x* instructions
4438;
4439; @param 1 The instruction
4440;
4441; @param A0 Pointer to the destination media register size operand (output).
4442; @param A1 The source operand value (input).
4443;
4444%macro IEMIMPL_V_PMOV_SZ_X 1
4445BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4446 PROLOGUE_2_ARGS
4447 IEMIMPL_SSE_PROLOGUE
4448
4449 movd xmm0, A1
4450 %1 xmm0, xmm0
4451 vmovdqu [A0], xmm0
4452
4453 IEMIMPL_SSE_PROLOGUE
4454 EPILOGUE_2_ARGS
4455ENDPROC iemAImpl_ %+ %1 %+ _u128
4456
4457BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4458 PROLOGUE_2_ARGS
4459 IEMIMPL_AVX_PROLOGUE
4460
4461 movd xmm0, A1
4462 v %+ %1 xmm0, xmm0
4463 vmovdqu [A0], xmm0
4464
4465 IEMIMPL_AVX_PROLOGUE
4466 EPILOGUE_2_ARGS
4467ENDPROC iemAImpl_v %+ %1 %+ _u128
4468
4469BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4470 PROLOGUE_2_ARGS
4471 IEMIMPL_AVX_PROLOGUE
4472
4473 movdqu xmm0, [A1]
4474 v %+ %1 ymm0, xmm0
4475 vmovdqu [A0], ymm0
4476
4477 IEMIMPL_AVX_PROLOGUE
4478 EPILOGUE_2_ARGS
4479ENDPROC iemAImpl_v %+ %1 %+ _u256
4480%endmacro
4481
4482IEMIMPL_V_PMOV_SZ_X pmovsxbw
4483IEMIMPL_V_PMOV_SZ_X pmovsxbd
4484IEMIMPL_V_PMOV_SZ_X pmovsxbq
4485IEMIMPL_V_PMOV_SZ_X pmovsxwd
4486IEMIMPL_V_PMOV_SZ_X pmovsxwq
4487IEMIMPL_V_PMOV_SZ_X pmovsxdq
4488
4489IEMIMPL_V_PMOV_SZ_X pmovzxbw
4490IEMIMPL_V_PMOV_SZ_X pmovzxbd
4491IEMIMPL_V_PMOV_SZ_X pmovzxbq
4492IEMIMPL_V_PMOV_SZ_X pmovzxwd
4493IEMIMPL_V_PMOV_SZ_X pmovzxwq
4494IEMIMPL_V_PMOV_SZ_X pmovzxdq
4495
4496
4497;;
4498; Need to move this as well somewhere better?
4499;
4500struc IEMSSERESULT
4501 .uResult resd 4
4502 .MXCSR resd 1
4503endstruc
4504
4505
4506;;
4507; Need to move this as well somewhere better?
4508;
4509struc IEMAVX128RESULT
4510 .uResult resd 4
4511 .MXCSR resd 1
4512endstruc
4513
4514
4515;;
4516; Need to move this as well somewhere better?
4517;
4518struc IEMAVX256RESULT
4519 .uResult resd 8
4520 .MXCSR resd 1
4521endstruc
4522
4523
4524;;
4525; Initialize the SSE MXCSR register using the guest value partially to
4526; account for rounding mode.
4527;
4528; @uses 4 bytes of stack to save the original value, T0.
4529; @param 1 Expression giving the address of the FXSTATE of the guest.
4530;
4531%macro SSE_LD_FXSTATE_MXCSR 1
4532 sub xSP, 4
4533
4534 stmxcsr [xSP]
4535 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4536 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4537 or T0_32, X86_MXCSR_XCPT_MASK
4538 sub xSP, 4
4539 mov [xSP], T0_32
4540 ldmxcsr [xSP]
4541 add xSP, 4
4542%endmacro
4543
4544
4545;;
4546; Restores the SSE MXCSR register with the original value.
4547;
4548; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4549; @param 1 Expression giving the address where to return the MXCSR value.
4550; @param 2 Expression giving the address of the FXSTATE of the guest.
4551;
4552; @note Restores the stack pointer.
4553;
4554%macro SSE_ST_FXSTATE_MXCSR 2
4555 sub xSP, 4
4556 stmxcsr [xSP]
4557 mov T0_32, [xSP]
4558 add xSP, 4
4559 ; Merge the status bits into the original MXCSR value.
4560 mov T1_32, [%2 + X86FXSTATE.MXCSR]
4561 and T0_32, X86_MXCSR_XCPT_FLAGS
4562 or T0_32, T1_32
4563 mov [%1 + IEMSSERESULT.MXCSR], T0_32
4564
4565 ldmxcsr [xSP]
4566 add xSP, 4
4567%endmacro
4568
4569
4570;;
4571; Initialize the SSE MXCSR register using the guest value partially to
4572; account for rounding mode.
4573;
4574; @uses 4 bytes of stack to save the original value.
4575; @param 1 Expression giving the address of the FXSTATE of the guest.
4576;
4577%macro AVX_LD_XSAVEAREA_MXCSR 1
4578 sub xSP, 4
4579
4580 stmxcsr [xSP]
4581 mov T0_32, [%1 + X86FXSTATE.MXCSR]
4582 and T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ
4583 sub xSP, 4
4584 mov [xSP], T0_32
4585 ldmxcsr [xSP]
4586 add xSP, 4
4587%endmacro
4588
4589
4590;;
4591; Restores the AVX128 MXCSR register with the original value.
4592;
4593; @param 1 Expression giving the address where to return the MXCSR value.
4594;
4595; @note Restores the stack pointer.
4596;
4597%macro AVX128_ST_XSAVEAREA_MXCSR 1
4598 stmxcsr [%1 + IEMAVX128RESULT.MXCSR]
4599
4600 ldmxcsr [xSP]
4601 add xSP, 4
4602%endmacro
4603
4604
4605;;
4606; Restores the AVX256 MXCSR register with the original value.
4607;
4608; @param 1 Expression giving the address where to return the MXCSR value.
4609;
4610; @note Restores the stack pointer.
4611;
4612%macro AVX256_ST_XSAVEAREA_MXCSR 1
4613 stmxcsr [%1 + IEMAVX256RESULT.MXCSR]
4614
4615 ldmxcsr [xSP]
4616 add xSP, 4
4617%endmacro
4618
4619
4620;;
4621; Floating point instruction working on two full sized registers.
4622;
4623; @param 1 The instruction
4624; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4625;
4626; @param A0 FPU context (FXSTATE or XSAVEAREA).
4627; @param A1 Where to return the result including the MXCSR value.
4628; @param A2 Pointer to the first media register size operand (input/output).
4629; @param A3 Pointer to the second media register size operand (input).
4630;
4631%macro IEMIMPL_FP_F2 2
4632BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
4633 PROLOGUE_4_ARGS
4634 IEMIMPL_SSE_PROLOGUE
4635 SSE_LD_FXSTATE_MXCSR A0
4636
4637 movdqu xmm0, [A2]
4638 movdqu xmm1, [A3]
4639 %1 xmm0, xmm1
4640 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4641
4642 SSE_ST_FXSTATE_MXCSR A1, A0
4643 IEMIMPL_SSE_PROLOGUE
4644 EPILOGUE_4_ARGS
4645ENDPROC iemAImpl_ %+ %1 %+ _u128
4646
4647 %if %2 == 3
4648BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4649 PROLOGUE_4_ARGS
4650 IEMIMPL_AVX_PROLOGUE
4651 AVX_LD_XSAVEAREA_MXCSR A0
4652
4653 vmovdqu xmm0, [A2]
4654 vmovdqu xmm1, [A3]
4655 v %+ %1 xmm0, xmm0, xmm1
4656 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4657
4658 AVX128_ST_XSAVEAREA_MXCSR A1
4659 IEMIMPL_AVX_PROLOGUE
4660 EPILOGUE_4_ARGS
4661ENDPROC iemAImpl_v %+ %1 %+ _u128
4662
4663BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4664 PROLOGUE_4_ARGS
4665 IEMIMPL_AVX_PROLOGUE
4666 AVX_LD_XSAVEAREA_MXCSR A0
4667
4668 vmovdqu ymm0, [A2]
4669 vmovdqu ymm1, [A3]
4670 v %+ %1 ymm0, ymm0, ymm1
4671 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4672
4673 AVX256_ST_XSAVEAREA_MXCSR A1
4674 IEMIMPL_AVX_PROLOGUE
4675 EPILOGUE_4_ARGS
4676ENDPROC iemAImpl_v %+ %1 %+ _u256
4677 %elif %2 == 2
4678BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
4679 PROLOGUE_4_ARGS
4680 IEMIMPL_AVX_PROLOGUE
4681 AVX_LD_XSAVEAREA_MXCSR A0
4682
4683 vmovdqu xmm0, [A2]
4684 vmovdqu xmm1, [A3]
4685 v %+ %1 xmm0, xmm1
4686 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4687
4688 AVX128_ST_XSAVEAREA_MXCSR A1
4689 IEMIMPL_AVX_PROLOGUE
4690 EPILOGUE_4_ARGS
4691ENDPROC iemAImpl_v %+ %1 %+ _u128
4692
4693BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
4694 PROLOGUE_4_ARGS
4695 IEMIMPL_AVX_PROLOGUE
4696 AVX_LD_XSAVEAREA_MXCSR A0
4697
4698 vmovdqu ymm0, [A2]
4699 vmovdqu ymm1, [A3]
4700 v %+ %1 ymm0, ymm1
4701 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4702
4703 AVX256_ST_XSAVEAREA_MXCSR A1
4704 IEMIMPL_AVX_PROLOGUE
4705 EPILOGUE_4_ARGS
4706ENDPROC iemAImpl_v %+ %1 %+ _u256
4707 %endif
4708%endmacro
4709
4710IEMIMPL_FP_F2 addps, 3
4711IEMIMPL_FP_F2 addpd, 3
4712IEMIMPL_FP_F2 mulps, 3
4713IEMIMPL_FP_F2 mulpd, 3
4714IEMIMPL_FP_F2 subps, 3
4715IEMIMPL_FP_F2 subpd, 3
4716IEMIMPL_FP_F2 minps, 3
4717IEMIMPL_FP_F2 minpd, 3
4718IEMIMPL_FP_F2 divps, 3
4719IEMIMPL_FP_F2 divpd, 3
4720IEMIMPL_FP_F2 maxps, 3
4721IEMIMPL_FP_F2 maxpd, 3
4722IEMIMPL_FP_F2 haddps, 3
4723IEMIMPL_FP_F2 haddpd, 3
4724IEMIMPL_FP_F2 hsubps, 3
4725IEMIMPL_FP_F2 hsubpd, 3
4726IEMIMPL_FP_F2 addsubps, 3
4727IEMIMPL_FP_F2 addsubpd, 3
4728
4729
4730;;
4731; These are actually unary operations but to keep it simple
4732; we treat them as binary for now, so the output result is
4733; always in sync with the register where the result might get written
4734; to.
4735IEMIMPL_FP_F2 sqrtps, 2
4736IEMIMPL_FP_F2 rsqrtps, 2
4737IEMIMPL_FP_F2 sqrtpd, 2
4738IEMIMPL_FP_F2 cvtdq2ps, 2
4739IEMIMPL_FP_F2 cvtps2dq, 2
4740IEMIMPL_FP_F2 cvttps2dq, 2
4741IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4742IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
4743IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4744
4745
4746;;
4747; Floating point instruction working on a full sized register and a single precision operand.
4748;
4749; @param 1 The instruction
4750;
4751; @param A0 FPU context (FXSTATE or XSAVEAREA).
4752; @param A1 Where to return the result including the MXCSR value.
4753; @param A2 Pointer to the first media register size operand (input/output).
4754; @param A3 Pointer to the second single precision floating point value (input).
4755;
4756%macro IEMIMPL_FP_F2_R32 1
4757BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
4758 PROLOGUE_4_ARGS
4759 IEMIMPL_SSE_PROLOGUE
4760 SSE_LD_FXSTATE_MXCSR A0
4761
4762 movdqu xmm0, [A2]
4763 movd xmm1, [A3]
4764 %1 xmm0, xmm1
4765 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4766
4767 SSE_ST_FXSTATE_MXCSR A1, A0
4768 IEMIMPL_SSE_EPILOGUE
4769 EPILOGUE_4_ARGS
4770ENDPROC iemAImpl_ %+ %1 %+ _u128_r32
4771
4772BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
4773 PROLOGUE_4_ARGS
4774 IEMIMPL_AVX_PROLOGUE
4775 AVX_LD_XSAVEAREA_MXCSR A0
4776
4777 vmovdqu xmm0, [A2]
4778 vmovd xmm1, [A3]
4779 v %+ %1 xmm0, xmm0, xmm1
4780 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4781
4782 AVX128_ST_XSAVEAREA_MXCSR A1
4783 IEMIMPL_AVX_PROLOGUE
4784 EPILOGUE_4_ARGS
4785ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
4786%endmacro
4787
4788IEMIMPL_FP_F2_R32 addss
4789IEMIMPL_FP_F2_R32 mulss
4790IEMIMPL_FP_F2_R32 subss
4791IEMIMPL_FP_F2_R32 minss
4792IEMIMPL_FP_F2_R32 divss
4793IEMIMPL_FP_F2_R32 maxss
4794IEMIMPL_FP_F2_R32 cvtss2sd
4795IEMIMPL_FP_F2_R32 sqrtss
4796IEMIMPL_FP_F2_R32 rsqrtss
4797
4798
4799;;
4800; Floating point instruction working on a full sized register and a double precision operand.
4801;
4802; @param 1 The instruction
4803;
4804; @param A0 FPU context (FXSTATE or XSAVEAREA).
4805; @param A1 Where to return the result including the MXCSR value.
4806; @param A2 Pointer to the first media register size operand (input/output).
4807; @param A3 Pointer to the second double precision floating point value (input).
4808;
4809%macro IEMIMPL_FP_F2_R64 1
4810BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
4811 PROLOGUE_4_ARGS
4812 IEMIMPL_SSE_PROLOGUE
4813 SSE_LD_FXSTATE_MXCSR A0
4814
4815 movdqu xmm0, [A2]
4816 movq xmm1, [A3]
4817 %1 xmm0, xmm1
4818 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4819
4820 SSE_ST_FXSTATE_MXCSR A1, A0
4821 IEMIMPL_SSE_EPILOGUE
4822 EPILOGUE_4_ARGS
4823ENDPROC iemAImpl_ %+ %1 %+ _u128_r64
4824
4825BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
4826 PROLOGUE_4_ARGS
4827 IEMIMPL_AVX_PROLOGUE
4828 AVX_LD_XSAVEAREA_MXCSR A0
4829
4830 vmovdqu xmm0, [A2]
4831 vmovq xmm1, [A3]
4832 v %+ %1 xmm0, xmm0, xmm1
4833 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4834
4835 AVX128_ST_XSAVEAREA_MXCSR A1
4836 IEMIMPL_AVX_EPILOGUE
4837 EPILOGUE_4_ARGS
4838ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
4839%endmacro
4840
4841IEMIMPL_FP_F2_R64 addsd
4842IEMIMPL_FP_F2_R64 mulsd
4843IEMIMPL_FP_F2_R64 subsd
4844IEMIMPL_FP_F2_R64 minsd
4845IEMIMPL_FP_F2_R64 divsd
4846IEMIMPL_FP_F2_R64 maxsd
4847IEMIMPL_FP_F2_R64 cvtsd2ss
4848IEMIMPL_FP_F2_R64 sqrtsd
4849
4850
4851;;
4852; Macro for the cvtpd2ps/cvtps2pd instructions.
4853;
4854; 1 The instruction name.
4855; 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
4856;
4857; @param A0 FPU context (FXSTATE or XSAVEAREA).
4858; @param A1 Where to return the result including the MXCSR value.
4859; @param A2 Pointer to the first media register size operand (input/output).
4860; @param A3 Pointer to the second media register size operand (input).
4861;
4862%macro IEMIMPL_CVT_F2 2
4863BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
4864 PROLOGUE_4_ARGS
4865 IEMIMPL_SSE_PROLOGUE
4866 SSE_LD_FXSTATE_MXCSR A0
4867
4868 movdqu xmm0, [A2]
4869 movdqu xmm1, [A3]
4870 %1 xmm0, xmm1
4871 movdqu [A1 + IEMSSERESULT.uResult], xmm0
4872
4873 SSE_ST_FXSTATE_MXCSR A1, A0
4874 IEMIMPL_SSE_EPILOGUE
4875 EPILOGUE_4_ARGS
4876ENDPROC iemAImpl_ %+ %1 %+ _u128
4877
4878BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
4879 PROLOGUE_4_ARGS
4880 IEMIMPL_AVX_PROLOGUE
4881 AVX_LD_XSAVEAREA_MXCSR A0
4882
4883 vmovdqu xmm0, [A2]
4884 vmovdqu xmm1, [A3]
4885 v %+ %1 xmm0, xmm1
4886 vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0
4887
4888 AVX128_ST_XSAVEAREA_MXCSR A1
4889 IEMIMPL_AVX_EPILOGUE
4890 EPILOGUE_4_ARGS
4891ENDPROC iemAImpl_v %+ %1 %+ _u128
4892
4893BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
4894 PROLOGUE_4_ARGS
4895 IEMIMPL_AVX_PROLOGUE
4896 AVX_LD_XSAVEAREA_MXCSR A0
4897
4898 vmovdqu ymm0, [A2]
4899 vmovdqu ymm1, [A3]
4900 %if %2 == 0
4901 v %+ %1 xmm0, ymm1
4902 %else
4903 v %+ %1 ymm0, xmm1
4904 %endif
4905 vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0
4906
4907 AVX256_ST_XSAVEAREA_MXCSR A1
4908 IEMIMPL_AVX_EPILOGUE
4909 EPILOGUE_4_ARGS
4910ENDPROC iemAImpl_v %+ %1 %+ _u256
4911%endmacro
4912
4913IEMIMPL_CVT_F2 cvtpd2ps, 0
4914IEMIMPL_CVT_F2 cvtps2pd, 1
4915
4916
4917;;
4918; shufps instructions with 8-bit immediates.
4919;
4920; @param A0 Pointer to the destination media register size operand (input/output).
4921; @param A1 Pointer to the first source media register size operand (input).
4922; @param A2 The 8-bit immediate
4923;
4924BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
4925 PROLOGUE_3_ARGS
4926 IEMIMPL_SSE_PROLOGUE
4927
4928 movdqu xmm0, [A0]
4929 movdqu xmm1, [A1]
4930 lea T1, [.imm0 xWrtRIP]
4931 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4932 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret+int3) == 10: A2 * 10 = (A2 * 5) * 2
4933 %else
4934 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret+int3) == 6: A2 * 6 = (A2 * 3) * 2
4935 %endif
4936 lea T1, [T1 + T0*2]
4937 IBT_NOTRACK
4938 call T1
4939 movdqu [A0], xmm0
4940
4941 IEMIMPL_SSE_EPILOGUE
4942 EPILOGUE_3_ARGS
4943 %assign bImm 0
4944 %rep 256
4945.imm %+ bImm:
4946 IBT_ENDBRxx_WITHOUT_NOTRACK
4947 shufps xmm0, xmm1, bImm
4948 ret
4949 int3
4950 %assign bImm bImm + 1
4951 %endrep
4952.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4953ENDPROC iemAImpl_shufps_u128
4954
4955
4956;;
4957; shufpd instruction with 8-bit immediates.
4958;
4959; @param A0 Pointer to the destination media register size operand (input/output).
4960; @param A1 Pointer to the first source media register size operand (input).
4961; @param A2 The 8-bit immediate
4962;
4963BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
4964 PROLOGUE_3_ARGS
4965 IEMIMPL_SSE_PROLOGUE
4966
4967 movdqu xmm0, [A0]
4968 movdqu xmm1, [A1]
4969 lea T1, [.imm0 xWrtRIP]
4970 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
4971 lea T0, [A2 + A2*4] ; sizeof(shufpX+ret) == 10: A2 * 10 = (A2 * 5) * 2
4972 %else
4973 lea T0, [A2 + A2*2] ; sizeof(shufpX+ret) == 6: A2 * 6 = (A2 * 3) * 2
4974 %endif
4975 lea T1, [T1 + T0*2]
4976 IBT_NOTRACK
4977 call T1
4978 movdqu [A0], xmm0
4979
4980 IEMIMPL_SSE_EPILOGUE
4981 EPILOGUE_3_ARGS
4982 %assign bImm 0
4983 %rep 256
4984.imm %+ bImm:
4985 IBT_ENDBRxx_WITHOUT_NOTRACK
4986 shufpd xmm0, xmm1, bImm
4987 ret
4988 %assign bImm bImm + 1
4989 %endrep
4990.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
4991ENDPROC iemAImpl_shufpd_u128
4992
4993
4994;;
4995; vshufp{s,d} instructions with 8-bit immediates.
4996;
4997; @param 1 The instruction name.
4998;
4999; @param A0 Pointer to the destination media register size operand (output).
5000; @param A1 Pointer to the first source media register size operand (input).
5001; @param A2 Pointer to the second source media register size operand (input).
5002; @param A3 The 8-bit immediate
5003;
5004%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
5005BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5006 PROLOGUE_4_ARGS
5007 IEMIMPL_AVX_PROLOGUE
5008
5009 movdqu xmm0, [A1]
5010 movdqu xmm1, [A2]
5011 lea T1, [.imm0 xWrtRIP]
5012 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5013 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5014 %else
5015 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5016 %endif
5017 lea T1, [T1 + T0*2]
5018 IBT_NOTRACK
5019 call T1
5020 movdqu [A0], xmm0
5021
5022 IEMIMPL_AVX_EPILOGUE
5023 EPILOGUE_4_ARGS
5024 %assign bImm 0
5025 %rep 256
5026.imm %+ bImm:
5027 IBT_ENDBRxx_WITHOUT_NOTRACK
5028 %1 xmm0, xmm0, xmm1, bImm
5029 ret
5030 %assign bImm bImm + 1
5031 %endrep
5032.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5033ENDPROC iemAImpl_ %+ %1 %+ _u128
5034
5035BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5036 PROLOGUE_4_ARGS
5037 IEMIMPL_AVX_PROLOGUE
5038
5039 vmovdqu ymm0, [A1]
5040 vmovdqu ymm1, [A2]
5041 lea T1, [.imm0 xWrtRIP]
5042 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5043 lea T0, [A3 + A3*4] ; sizeof(vshufpX+ret) == 10: A3 * 10 = (A3 * 5) * 2
5044 %else
5045 lea T0, [A3 + A3*2] ; sizeof(vshufpX+ret) == 6: A3 * 6 = (A3 * 3) * 2
5046 %endif
5047 lea T1, [T1 + T0*2]
5048 IBT_NOTRACK
5049 call T1
5050 vmovdqu [A0], ymm0
5051
5052 IEMIMPL_AVX_EPILOGUE
5053 EPILOGUE_4_ARGS
5054 %assign bImm 0
5055 %rep 256
5056.imm %+ bImm:
5057 IBT_ENDBRxx_WITHOUT_NOTRACK
5058 %1 ymm0, ymm0, ymm1, bImm
5059 ret
5060 %assign bImm bImm + 1
5061 %endrep
5062.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5063ENDPROC iemAImpl_ %+ %1 %+ _u256
5064%endmacro
5065
5066IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
5067IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5068
5069
5070;;
5071; One of the [p]blendv{b,ps,pd} variants
5072;
5073; @param 1 The instruction
5074;
5075; @param A0 Pointer to the first media register sized operand (input/output).
5076; @param A1 Pointer to the second media sized value (input).
5077; @param A2 Pointer to the media register sized mask value (input).
5078;
5079%macro IEMIMPL_P_BLEND 1
5080BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5081 PROLOGUE_3_ARGS
5082 IEMIMPL_SSE_PROLOGUE
5083
5084 movdqu xmm0, [A2] ; This is implicit
5085 movdqu xmm1, [A0]
5086 movdqu xmm2, [A1] ; @todo Do I need to save the original value here first?
5087 %1 xmm1, xmm2
5088 movdqu [A0], xmm1
5089
5090 IEMIMPL_SSE_PROLOGUE
5091 EPILOGUE_3_ARGS
5092ENDPROC iemAImpl_ %+ %1 %+ _u128
5093%endmacro
5094
5095IEMIMPL_P_BLEND pblendvb
5096IEMIMPL_P_BLEND blendvps
5097IEMIMPL_P_BLEND blendvpd
5098
5099
5100;;
5101; One of the v[p]blendv{b,ps,pd} variants
5102;
5103; @param 1 The instruction
5104;
5105; @param A0 Pointer to the first media register sized operand (output).
5106; @param A1 Pointer to the first media register sized operand (input).
5107; @param A2 Pointer to the second media register sized operand (input).
5108; @param A3 Pointer to the media register sized mask value (input).
5109%macro IEMIMPL_AVX_P_BLEND 1
5110BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5111 PROLOGUE_4_ARGS
5112 IEMIMPL_AVX_PROLOGUE
5113
5114 vmovdqu xmm0, [A1]
5115 vmovdqu xmm1, [A2]
5116 vmovdqu xmm2, [A3]
5117 %1 xmm0, xmm0, xmm1, xmm2
5118 vmovdqu [A0], xmm0
5119
5120 IEMIMPL_AVX_PROLOGUE
5121 EPILOGUE_4_ARGS
5122ENDPROC iemAImpl_ %+ %1 %+ _u128
5123
5124BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5125 PROLOGUE_4_ARGS
5126 IEMIMPL_AVX_PROLOGUE
5127
5128 vmovdqu ymm0, [A1]
5129 vmovdqu ymm1, [A2]
5130 vmovdqu ymm2, [A3]
5131 %1 ymm0, ymm0, ymm1, ymm2
5132 vmovdqu [A0], ymm0
5133
5134 IEMIMPL_AVX_PROLOGUE
5135 EPILOGUE_4_ARGS
5136ENDPROC iemAImpl_ %+ %1 %+ _u256
5137%endmacro
5138
5139IEMIMPL_AVX_P_BLEND vpblendvb
5140IEMIMPL_AVX_P_BLEND vblendvps
5141IEMIMPL_AVX_P_BLEND vblendvpd
5142
5143
5144;;
5145; palignr mm1, mm2/m64 instruction.
5146;
5147; @param A0 Pointer to the first media register sized operand (output).
5148; @param A1 The second register sized operand (input).
5149; @param A2 The 8-bit immediate.
5150BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
5151 PROLOGUE_3_ARGS
5152 IEMIMPL_MMX_PROLOGUE
5153
5154 movq mm0, [A0]
5155 movq mm1, A1
5156 lea T1, [.imm0 xWrtRIP]
5157 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5158 lea T0, [A2 + A2*4] ; sizeof(endbrxx+palignr+ret) == 10: A2 * 10 = (A2 * 5) * 2
5159 %else
5160 lea T0, [A2 + A2*2] ; sizeof(palignr+ret) == 6: A2 * 6 = (A2 * 3) * 2
5161 %endif
5162 lea T1, [T1 + T0*2]
5163 IBT_NOTRACK
5164 call T1
5165 movq [A0], mm0
5166
5167 IEMIMPL_MMX_EPILOGUE
5168 EPILOGUE_3_ARGS
5169 %assign bImm 0
5170 %rep 256
5171.imm %+ bImm:
5172 IBT_ENDBRxx_WITHOUT_NOTRACK
5173 palignr mm0, mm1, bImm
5174 ret
5175 %assign bImm bImm + 1
5176 %endrep
5177.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5178ENDPROC iemAImpl_palignr_u64
5179
5180
5181;;
5182; SSE instructions with 8-bit immediates of the form
5183; xxx xmm1, xmm2, imm8.
5184; where the instruction encoding takes up 6 bytes.
5185;
5186; @param 1 The instruction name.
5187;
5188; @param A0 Pointer to the first media register size operand (input/output).
5189; @param A1 Pointer to the second source media register size operand (input).
5190; @param A2 The 8-bit immediate
5191;
5192%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
5193BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5194 PROLOGUE_3_ARGS
5195 IEMIMPL_SSE_PROLOGUE
5196
5197 movdqu xmm0, [A0]
5198 movdqu xmm1, [A1]
5199 lea T1, [.imm0 xWrtRIP]
5200 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5201 lea T0, [A2 + A2*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A2 * 12 = (A2 * 3) * 4
5202 lea T1, [T1 + T0*4]
5203 %else
5204 lea T1, [T1 + A2*8] ; sizeof(insnX+ret+int3) == 8: A2 * 8
5205 %endif
5206 IBT_NOTRACK
5207 call T1
5208 movdqu [A0], xmm0
5209
5210 IEMIMPL_SSE_EPILOGUE
5211 EPILOGUE_3_ARGS
5212 %assign bImm 0
5213 %rep 256
5214.imm %+ bImm:
5215 IBT_ENDBRxx_WITHOUT_NOTRACK
5216 %1 xmm0, xmm1, bImm
5217 ret
5218 int3
5219 %assign bImm bImm + 1
5220 %endrep
5221.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5222ENDPROC iemAImpl_ %+ %1 %+ _u128
5223%endmacro
5224
5225IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
5226IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
5227IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
5228IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
5229IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
5230IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5231IEMIMPL_MEDIA_SSE_INSN_IMM8_6 mpsadbw
5232
5233
5234;;
5235; AVX instructions with 8-bit immediates of the form
5236; xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
5237; where the instruction encoding takes up 6 bytes.
5238;
5239; @param 1 The instruction name.
5240; @param 2 Whether the instruction has a 128-bit variant (1) or not (0).
5241; @param 3 Whether the instruction has a 256-bit variant (1) or not (0).
5242;
5243; @param A0 Pointer to the destination media register size operand (output).
5244; @param A1 Pointer to the first source media register size operand (input).
5245; @param A2 Pointer to the second source media register size operand (input).
5246; @param A3 The 8-bit immediate
5247;
5248%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 3
5249 %if %2 == 1
5250BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5251 PROLOGUE_4_ARGS
5252 IEMIMPL_AVX_PROLOGUE
5253
5254 movdqu xmm0, [A1]
5255 movdqu xmm1, [A2]
5256 lea T1, [.imm0 xWrtRIP]
5257 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5258 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5259 lea T1, [T1 + T0*4]
5260 %else
5261 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5262 %endif
5263 IBT_NOTRACK
5264 call T1
5265 movdqu [A0], xmm0
5266
5267 IEMIMPL_AVX_EPILOGUE
5268 EPILOGUE_4_ARGS
5269 %assign bImm 0
5270 %rep 256
5271.imm %+ bImm:
5272 IBT_ENDBRxx_WITHOUT_NOTRACK
5273 %1 xmm0, xmm0, xmm1, bImm
5274 ret
5275 int3
5276 %assign bImm bImm + 1
5277 %endrep
5278.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5279ENDPROC iemAImpl_ %+ %1 %+ _u128
5280 %endif
5281
5282 %if %3 == 1
5283BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
5284 PROLOGUE_4_ARGS
5285 IEMIMPL_AVX_PROLOGUE
5286
5287 vmovdqu ymm0, [A1]
5288 vmovdqu ymm1, [A2]
5289 lea T1, [.imm0 xWrtRIP]
5290 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5291 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
5292 lea T1, [T1 + T0*4]
5293 %else
5294 lea T1, [T1 + A3*8] ; sizeof(insnX+ret+int3) == 8: A3 * 8
5295 %endif
5296 IBT_NOTRACK
5297 call T1
5298 vmovdqu [A0], ymm0
5299
5300 IEMIMPL_AVX_EPILOGUE
5301 EPILOGUE_4_ARGS
5302 %assign bImm 0
5303 %rep 256
5304.imm %+ bImm:
5305 IBT_ENDBRxx_WITHOUT_NOTRACK
5306 %1 ymm0, ymm0, ymm1, bImm
5307 ret
5308 int3
5309 %assign bImm bImm + 1
5310 %endrep
5311.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5312ENDPROC iemAImpl_ %+ %1 %+ _u256
5313 %endif
5314%endmacro
5315
5316IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1, 1
5317IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1, 1
5318IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1, 1
5319IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1, 1
5320IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 1, 0
5321IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2i128, 0, 1
5322IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vperm2f128, 0, 1
5323
5324
5325;;
5326; Need to move this as well somewhere better?
5327;
5328struc IEMPCMPISTRXSRC
5329 .uSrc1 resd 4
5330 .uSrc2 resd 4
5331endstruc
5332
5333struc IEMPCMPESTRXSRC
5334 .uSrc1 resd 4
5335 .uSrc2 resd 4
5336 .u64Rax resd 2
5337 .u64Rdx resd 2
5338endstruc
5339
5340;;
5341; The pcmpistri instruction.
5342;
5343; @param A0 Pointer to the ECX register to store the result to (output).
5344; @param A1 Pointer to the EFLAGS register.
5345; @param A2 Pointer to the structure containing the source operands (input).
5346; @param A3 The 8-bit immediate
5347;
5348BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
5349 PROLOGUE_4_ARGS
5350 IEMIMPL_SSE_PROLOGUE
5351
5352 movdqu xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
5353 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
5354 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5355 lea T1, [.imm0 xWrtRIP]
5356 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5357 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5358 lea T1, [T1 + T0*4]
5359 %else
5360 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5361 %endif
5362 IBT_NOTRACK
5363 call T1
5364
5365 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5366 mov [T2], ecx
5367
5368 IEMIMPL_SSE_EPILOGUE
5369 EPILOGUE_4_ARGS
5370 %assign bImm 0
5371 %rep 256
5372.imm %+ bImm:
5373 IBT_ENDBRxx_WITHOUT_NOTRACK
5374 pcmpistri xmm0, xmm1, bImm
5375 ret
5376 int3
5377 %assign bImm bImm + 1
5378 %endrep
5379.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5380ENDPROC iemAImpl_pcmpistri_u128
5381
5382;;
5383; The pcmpestri instruction.
5384;
5385; @param A0 Pointer to the ECX register to store the result to (output).
5386; @param A1 Pointer to the EFLAGS register.
5387; @param A2 Pointer to the structure containing the source operands (input).
5388; @param A3 The 8-bit immediate
5389;
5390BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
5391 PROLOGUE_4_ARGS
5392 IEMIMPL_SSE_PROLOGUE
5393
5394 movdqu xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
5395 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
5396 mov T2, A0 ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
5397 lea T1, [.imm0 xWrtRIP]
5398 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5399 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5400 lea T1, [T1 + T0*4]
5401 %else
5402 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5403 %endif
5404 push xDX ; xDX can be A1 or A2 depending on the calling convention
5405 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5406 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5407 IBT_NOTRACK
5408 call T1
5409
5410 pop xDX
5411 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5412 mov [T2], ecx
5413
5414 IEMIMPL_SSE_EPILOGUE
5415 EPILOGUE_4_ARGS
5416 %assign bImm 0
5417 %rep 256
5418.imm %+ bImm:
5419 IBT_ENDBRxx_WITHOUT_NOTRACK
5420 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5421 pcmpestri xmm0, xmm1, bImm
5422 ret
5423 %assign bImm bImm + 1
5424 %endrep
5425.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5426ENDPROC iemAImpl_pcmpestri_u128
5427
5428;;
5429; The pcmpistrm instruction template.
5430;
5431; @param A0 Pointer to the XMM0 register to store the result to (output).
5432; @param A1 Pointer to the EFLAGS register.
5433; @param A2 Pointer to the structure containing the source operands (input).
5434; @param A3 The 8-bit immediate
5435;
5436BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
5437 PROLOGUE_4_ARGS
5438 IEMIMPL_SSE_PROLOGUE
5439
5440 movdqu xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
5441 movdqu xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
5442 lea T1, [.imm0 xWrtRIP]
5443 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5444 lea T0, [A3 + A3*2] ; sizeof(endbrxx+pcmpistrm+ret) == 12: A3 * 12 = (A3 * 3) * 4
5445 lea T1, [T1 + T0*4]
5446 %else
5447 lea T0, [T1 + A3*8] ; sizeof(pcmpistrm+ret) == 8: A3 * 8
5448 %endif
5449 IBT_NOTRACK
5450 call T1
5451
5452 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5453 movdqu [A0], xmm0
5454
5455 IEMIMPL_SSE_EPILOGUE
5456 EPILOGUE_4_ARGS
5457 %assign bImm 0
5458 %rep 256
5459.imm %+ bImm:
5460 IBT_ENDBRxx_WITHOUT_NOTRACK
5461 pcmpistrm xmm1, xmm2, bImm
5462 ret
5463 int3
5464 %assign bImm bImm + 1
5465 %endrep
5466.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5467ENDPROC iemAImpl_pcmpistrm_u128
5468
5469;;
5470; The pcmpestrm instruction template.
5471;
5472; @param A0 Pointer to the XMM0 register to store the result to (output).
5473; @param A1 Pointer to the EFLAGS register.
5474; @param A2 Pointer to the structure containing the source operands (input).
5475; @param A3 The 8-bit immediate
5476;
5477BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
5478 PROLOGUE_4_ARGS
5479 IEMIMPL_SSE_PROLOGUE
5480
5481 movdqu xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
5482 movdqu xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
5483 lea T1, [.imm0 xWrtRIP]
5484 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5485 lea T0, [A3 + A3*2] ; sizeof(endbrxx+insnX+ret) == 12: A3 * 12 = (A3 * 3) * 4
5486 lea T1, [T1 + T0*4]
5487 %else
5488 lea T1, [T1 + A3*8] ; sizeof(insnX+ret) == 8: A3 * 8
5489 %endif
5490 push xDX ; xDX can be A1 or A2 depending on the calling convention
5491 mov xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
5492 mov xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
5493 IBT_NOTRACK
5494 call T1
5495
5496 pop xDX
5497 IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
5498 movdqu [A0], xmm0
5499
5500 IEMIMPL_SSE_EPILOGUE
5501 EPILOGUE_4_ARGS
5502 %assign bImm 0
5503 %rep 256
5504.imm %+ bImm:
5505 IBT_ENDBRxx_WITHOUT_NOTRACK
5506 db 0x48 ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
5507 pcmpestrm xmm1, xmm2, bImm
5508 ret
5509 %assign bImm bImm + 1
5510 %endrep
5511.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
5512ENDPROC iemAImpl_pcmpestrm_u128
5513
5514
5515;;
5516; pinsrw instruction.
5517;
5518; @param A0 Pointer to the first media register size operand (input/output).
5519; @param A1 The 16 bit input operand (input).
5520; @param A2 The 8-bit immediate
5521;
5522BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
5523 PROLOGUE_3_ARGS
5524 IEMIMPL_SSE_PROLOGUE
5525
5526 movq mm0, [A0]
5527 lea T1, [.imm0 xWrtRIP]
5528 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5529 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pinsrw+ret) == 9: A2 * 9
5530 %else
5531 lea T0, [A2 + A2*4] ; sizeof(pinsrw+ret) == 5: A2 * 5
5532 %endif
5533 lea T1, [T1 + T0]
5534 IBT_NOTRACK
5535 call T1
5536 movq [A0], mm0
5537
5538 IEMIMPL_SSE_EPILOGUE
5539 EPILOGUE_3_ARGS
5540 %assign bImm 0
5541 %rep 256
5542.imm %+ bImm:
5543 IBT_ENDBRxx_WITHOUT_NOTRACK
5544 pinsrw mm0, A1_32, bImm
5545 ret
5546 %assign bImm bImm + 1
5547 %endrep
5548.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5549ENDPROC iemAImpl_pinsrw_u64
5550
;; pinsrw instruction (XMM register variant); same stub-table technique as
;; iemAImpl_pinsrw_u64, with 6-byte stubs due to the operand-size prefix.
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pinsrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pinsrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pinsrw_u128
5579
5580;;
5581; vpinsrw instruction.
5582;
5583; @param A0 Pointer to the first media register size operand (output).
5584; @param A1 Pointer to the source media register size operand (input).
5585; @param A2 The 16 bit input operand (input).
5586; @param A3 The 8-bit immediate
5587;
5588BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
5589 PROLOGUE_4_ARGS
5590 IEMIMPL_SSE_PROLOGUE
5591
5592 movdqu xmm0, [A1]
5593 lea T1, [.imm0 xWrtRIP]
5594 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5595 lea T0, [A3 + A3*4] ; sizeof(endbrxx+vpinsrw+ret) == 10: A3 * 10 = (A3 * 5) * 2
5596 %else
5597 lea T0, [A3 + A3*2] ; sizeof(vpinsrw+ret) == 6: A3 * 6 = (A3 * 3) * 2
5598 %endif
5599 lea T1, [T1 + T0*2]
5600 mov A1, A2 ; A2 requires longer encoding on Windows
5601 IBT_NOTRACK
5602 call T1
5603 movdqu [A0], xmm0
5604
5605 IEMIMPL_SSE_EPILOGUE
5606 EPILOGUE_4_ARGS
5607 %assign bImm 0
5608 %rep 256
5609.imm %+ bImm:
5610 IBT_ENDBRxx_WITHOUT_NOTRACK
5611 vpinsrw xmm0, xmm0, A1_32, bImm
5612 ret
5613 %assign bImm bImm + 1
5614 %endrep
5615.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5616ENDPROC iemAImpl_vpinsrw_u128
5617
5618
5619;;
5620; pextrw instruction.
5621;
5622; @param A0 Pointer to the 16bit output operand (output).
5623; @param A1 Pointer to the media register size operand (input).
5624; @param A2 The 8-bit immediate
5625;
5626BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
5627 PROLOGUE_3_ARGS
5628 IEMIMPL_SSE_PROLOGUE
5629
5630 movq mm0, A1
5631 lea T1, [.imm0 xWrtRIP]
5632 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5633 lea T0, [A2 + A2*8] ; sizeof(endbrxx+pextrw+ret) == 9: A2 * 9
5634 %else
5635 lea T0, [A2 + A2*4] ; sizeof(pextrw+ret) == 5: A2 * 5
5636 %endif
5637 lea T1, [T1 + T0]
5638 IBT_NOTRACK
5639 call T1
5640 mov word [A0], T0_16
5641
5642 IEMIMPL_SSE_EPILOGUE
5643 EPILOGUE_3_ARGS
5644 %assign bImm 0
5645 %rep 256
5646.imm %+ bImm:
5647 IBT_ENDBRxx_WITHOUT_NOTRACK
5648 pextrw T0_32, mm0, bImm
5649 ret
5650 %assign bImm bImm + 1
5651 %endrep
5652.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
5653ENDPROC iemAImpl_pextrw_u64
5654
;; pextrw instruction (XMM register variant); same stub-table technique as
;; iemAImpl_pextrw_u64, with 6-byte stubs due to the operand-size prefix.
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; base of the 256-entry stub table
 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
        lea     T0, [A2 + A2*4]         ; sizeof(endbrxx+pextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
 %else
        lea     T0, [A2 + A2*2]         ; sizeof(pextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
 %endif
        lea     T1, [T1 + T0*2]         ; T1 = .imm0 + imm8 * stub size
        IBT_NOTRACK
        call    T1
        mov     word [A0], T0_16        ; store the word extracted by the stub

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        IBT_ENDBRxx_WITHOUT_NOTRACK
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
ENDPROC iemAImpl_pextrw_u128
5683
5684;;
5685; vpextrw instruction.
5686;
5687; @param A0 Pointer to the 16bit output operand (output).
5688; @param A1 Pointer to the source media register size operand (input).
5689; @param A2 The 8-bit immediate
5690;
5691BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
5692 PROLOGUE_3_ARGS
5693 IEMIMPL_SSE_PROLOGUE
5694
5695 movdqu xmm0, [A1]
5696 lea T1, [.imm0 xWrtRIP]
5697 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
5698 lea T0, [A2 + A2*4] ; sizeof(endbrxx+vpextrw+ret) == 10: A2 * 10 = (A2 * 5) * 2
5699 %else
5700 lea T0, [A2 + A2*2] ; sizeof(vpextrw+ret) == 6: A2 * 6 = (A2 * 3) * 2
5701 %endif
5702 lea T1, [T1 + T0*2]
5703 IBT_NOTRACK
5704 call T1
5705 mov word [A0], T0_16
5706
5707 IEMIMPL_SSE_EPILOGUE
5708 EPILOGUE_3_ARGS
5709 %assign bImm 0
5710 %rep 256
5711.imm %+ bImm:
5712 IBT_ENDBRxx_WITHOUT_NOTRACK
5713 vpextrw T0_32, xmm0, bImm
5714 ret
5715 %assign bImm bImm + 1
5716 %endrep
5717.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
5718ENDPROC iemAImpl_vpextrw_u128
5719
5720
5721;;
5722; movmskp{s,d} SSE instruction template
5723;
5724; @param 1 The SSE instruction name.
5725; @param 2 The AVX instruction name.
5726;
5727; @param A0 Pointer to the output register (output/byte sized).
5728; @param A1 Pointer to the source media register size operand (input).
5729;
5730%macro IEMIMPL_MEDIA_MOVMSK_P 2
5731BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
5732 PROLOGUE_2_ARGS
5733 IEMIMPL_SSE_PROLOGUE
5734
5735 movdqu xmm0, [A1]
5736 %1 T0, xmm0
5737 mov byte [A0], T0_8
5738
5739 IEMIMPL_SSE_EPILOGUE
5740 EPILOGUE_2_ARGS
5741ENDPROC iemAImpl_ %+ %1 %+ _u128
5742
5743BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
5744 PROLOGUE_2_ARGS
5745 IEMIMPL_AVX_PROLOGUE
5746
5747 movdqu xmm0, [A1]
5748 %2 T0, xmm0
5749 mov byte [A0], T0_8
5750
5751 IEMIMPL_AVX_EPILOGUE
5752 EPILOGUE_2_ARGS
5753ENDPROC iemAImpl_ %+ %2 %+ _u128
5754
5755BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
5756 PROLOGUE_2_ARGS
5757 IEMIMPL_AVX_PROLOGUE
5758
5759 vmovdqu ymm0, [A1]
5760 %2 T0, ymm0
5761 mov byte [A0], T0_8
5762
5763 IEMIMPL_AVX_EPILOGUE
5764 EPILOGUE_2_ARGS
5765ENDPROC iemAImpl_ %+ %2 %+ _u256
5766%endmacro
5767
5768IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
5769IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5770
5771
5772;;
5773; Restores the SSE MXCSR register with the original value.
5774;
5775; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
5776; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
5777; @param    2       Expression giving the address of the FXSTATE of the guest.
5778;
5779; @note Restores the stack pointer.
5780;
; NOTE(review): the trailing ldmxcsr/add pair consumes a 4-byte saved host
; MXCSR value that the matching SSE_LD_FXSTATE_MXCSR macro (not visible in
; this excerpt) presumably left on the stack - confirm against that macro.
5781%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
5782        sub     xSP, 4
5783        stmxcsr [xSP]                       ; read the current (post-op) MXCSR
5784        mov     T0_32, [xSP]
5785        add     xSP, 4
5786        ; Merge the status bits into the original MXCSR value.
5787        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
5788        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the freshly raised exception flags
5789        or      T0_32, T1_32
5790        mov     [%1], T0_32                 ; return merged MXCSR to the caller
5791
5792        ldmxcsr [xSP]                       ; restore the saved host MXCSR and pop it
5793        add     xSP, 4
5794%endmacro
5795
5796
5797;;
5798; cvttsd2si instruction - 32-bit variant.
5799;
5800; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5801; @param    A1      Where to return the MXCSR value.
5802; @param    A2      Pointer to the result operand (output).
5803; @param    A3      Pointer to the second operand (input).
5804;
5805BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
5806        PROLOGUE_4_ARGS
5807        IEMIMPL_SSE_PROLOGUE
5808        SSE_LD_FXSTATE_MXCSR A0             ; load guest rounding/DAZ/FZ settings
5809
5810        cvttsd2si T0_32, [A3]               ; truncating double -> int32 conversion
5811        mov     dword [A2], T0_32
5812
5813        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0    ; return merged MXCSR, restore host state
5814        IEMIMPL_SSE_EPILOGUE
5815        EPILOGUE_4_ARGS
5816ENDPROC iemAImpl_cvttsd2si_i32_r64
5817
5818;;
5819; cvttsd2si instruction - 64-bit variant.
5820;
5821; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5822; @param    A1      Where to return the MXCSR value.
5823; @param    A2      Pointer to the result operand (output).
5824; @param    A3      Pointer to the second operand (input).
5825;
5826BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
5827        PROLOGUE_4_ARGS
5828        IEMIMPL_SSE_PROLOGUE
5829        SSE_LD_FXSTATE_MXCSR A0
5830
5831        cvttsd2si T0, [A3]                  ; truncating double -> int64 conversion
5832        mov     qword [A2], T0
5833
5834        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5835        IEMIMPL_SSE_EPILOGUE
5836        EPILOGUE_4_ARGS
5837ENDPROC iemAImpl_cvttsd2si_i64_r64
5838
5839
5840;;
5841; cvtsd2si instruction - 32-bit variant.
5842;
5843; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5844; @param    A1      Where to return the MXCSR value.
5845; @param    A2      Pointer to the result operand (output).
5846; @param    A3      Pointer to the second operand (input).
5847;
5848BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
5849        PROLOGUE_4_ARGS
5850        IEMIMPL_SSE_PROLOGUE
5851        SSE_LD_FXSTATE_MXCSR A0             ; apply guest rounding mode for the conversion
5852
5853        cvtsd2si T0_32, [A3]                ; rounding double -> int32 (honours MXCSR.RC)
5854        mov     dword [A2], T0_32
5855
5856        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5857        IEMIMPL_SSE_EPILOGUE
5858        EPILOGUE_4_ARGS
5859ENDPROC iemAImpl_cvtsd2si_i32_r64
5860
5861;;
5862; cvtsd2si instruction - 64-bit variant.
5863;
5864; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5865; @param    A1      Where to return the MXCSR value.
5866; @param    A2      Pointer to the result operand (output).
5867; @param    A3      Pointer to the second operand (input).
5868;
5869BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
5870        PROLOGUE_4_ARGS
5871        IEMIMPL_SSE_PROLOGUE
5872        SSE_LD_FXSTATE_MXCSR A0
5873
5874        cvtsd2si T0, [A3]                   ; rounding double -> int64
5875        mov     qword [A2], T0
5876
5877        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5878        IEMIMPL_SSE_EPILOGUE
5879        EPILOGUE_4_ARGS
5880ENDPROC iemAImpl_cvtsd2si_i64_r64
5881
5882
5883;;
5884; cvttss2si instruction - 32-bit variant.
5885;
5886; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5887; @param    A1      Where to return the MXCSR value.
5888; @param    A2      Pointer to the result operand (output).
5889; @param    A3      Pointer to the second operand (input).
5890;
5891BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
5892        PROLOGUE_4_ARGS
5893        IEMIMPL_SSE_PROLOGUE
5894        SSE_LD_FXSTATE_MXCSR A0             ; load guest rounding/DAZ/FZ settings
5895
5896        cvttss2si T0_32, [A3]               ; truncating float -> int32 conversion
5897        mov     dword [A2], T0_32
5898
5899        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5900        IEMIMPL_SSE_EPILOGUE
5901        EPILOGUE_4_ARGS
5902ENDPROC iemAImpl_cvttss2si_i32_r32
5903
5904;;
5905; cvttss2si instruction - 64-bit variant.
5906;
5907; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5908; @param    A1      Where to return the MXCSR value.
5909; @param    A2      Pointer to the result operand (output).
5910; @param    A3      Pointer to the second operand (input).
5911;
5912BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
5913        PROLOGUE_4_ARGS
5914        IEMIMPL_SSE_PROLOGUE
5915        SSE_LD_FXSTATE_MXCSR A0
5916
5917        cvttss2si T0, [A3]                  ; truncating float -> int64 conversion
5918        mov     qword [A2], T0
5919
5920        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5921        IEMIMPL_SSE_EPILOGUE
5922        EPILOGUE_4_ARGS
5923ENDPROC iemAImpl_cvttss2si_i64_r32
5924
5925
5926;;
5927; cvtss2si instruction - 32-bit variant.
5928;
5929; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5930; @param    A1      Where to return the MXCSR value.
5931; @param    A2      Pointer to the result operand (output).
5932; @param    A3      Pointer to the second operand (input).
5933;
5934BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
5935        PROLOGUE_4_ARGS
5936        IEMIMPL_SSE_PROLOGUE
5937        SSE_LD_FXSTATE_MXCSR A0             ; apply guest rounding mode for the conversion
5938
5939        cvtss2si T0_32, [A3]                ; rounding float -> int32 (honours MXCSR.RC)
5940        mov     dword [A2], T0_32
5941
5942        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5943        IEMIMPL_SSE_EPILOGUE
5944        EPILOGUE_4_ARGS
5945ENDPROC iemAImpl_cvtss2si_i32_r32
5946
5947;;
5948; cvtss2si instruction - 64-bit variant.
5949;
5950; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5951; @param    A1      Where to return the MXCSR value.
5952; @param    A2      Pointer to the result operand (output).
5953; @param    A3      Pointer to the second operand (input).
5954;
5955BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
5956        PROLOGUE_4_ARGS
5957        IEMIMPL_SSE_PROLOGUE
5958        SSE_LD_FXSTATE_MXCSR A0
5959
5960        cvtss2si T0, [A3]                   ; rounding float -> int64
5961        mov     qword [A2], T0
5962
5963        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5964        IEMIMPL_SSE_EPILOGUE
5965        EPILOGUE_4_ARGS
5966ENDPROC iemAImpl_cvtss2si_i64_r32
5967
5968
5969;;
5970; cvtsi2ss instruction - 32-bit variant.
5971;
5972; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5973; @param    A1      Where to return the MXCSR value.
5974; @param    A2      Pointer to the result operand (output).
5975; @param    A3      Pointer to the second operand (input).
5976;
5977BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
5978        PROLOGUE_4_ARGS
5979        IEMIMPL_SSE_PROLOGUE
5980        SSE_LD_FXSTATE_MXCSR A0             ; apply guest rounding mode for the conversion
5981
5982        cvtsi2ss xmm0, dword [A3]           ; int32 -> single-precision float
5983        movd    dword [A2], xmm0            ; only the low 32-bit scalar result is stored
5984
5985        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
5986        IEMIMPL_SSE_EPILOGUE
5987        EPILOGUE_4_ARGS
5988ENDPROC iemAImpl_cvtsi2ss_r32_i32
5989
5990;;
5991; cvtsi2ss instruction - 64-bit variant.
5992;
5993; @param    A0      FPU context (FXSTATE or XSAVEAREA).
5994; @param    A1      Where to return the MXCSR value.
5995; @param    A2      Pointer to the result operand (output).
5996; @param    A3      Pointer to the second operand (input).
5997;
5998BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
5999        PROLOGUE_4_ARGS
6000        IEMIMPL_SSE_PROLOGUE
6001        SSE_LD_FXSTATE_MXCSR A0
6002
6003        cvtsi2ss xmm0, qword [A3]           ; int64 -> single-precision float
6004        movd    dword [A2], xmm0
6005
6006        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6007        IEMIMPL_SSE_EPILOGUE
6008        EPILOGUE_4_ARGS
6009ENDPROC iemAImpl_cvtsi2ss_r32_i64
6010
6011
6012;;
6013; cvtsi2sd instruction - 32-bit variant.
6014;
6015; @param    A0      FPU context (FXSTATE or XSAVEAREA).
6016; @param    A1      Where to return the MXCSR value.
6017; @param    A2      Pointer to the result operand (output).
6018; @param    A3      Pointer to the second operand (input).
6019;
6020BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
6021        PROLOGUE_4_ARGS
6022        IEMIMPL_SSE_PROLOGUE
6023        SSE_LD_FXSTATE_MXCSR A0             ; apply guest rounding mode for the conversion
6024
6025        cvtsi2sd xmm0, dword [A3]           ; int32 -> double-precision float (always exact)
6026        movq    [A2], xmm0                  ; store only the low 64-bit scalar result
6027
6028        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6029        IEMIMPL_SSE_EPILOGUE
6030        EPILOGUE_4_ARGS
6031ENDPROC iemAImpl_cvtsi2sd_r64_i32
6032
6033;;
6034; cvtsi2sd instruction - 64-bit variant.
6035;
6036; @param    A0      FPU context (FXSTATE or XSAVEAREA).
6037; @param    A1      Where to return the MXCSR value.
6038; @param    A2      Pointer to the result operand (output).
6039; @param    A3      Pointer to the second operand (input).
6040;
6041BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
6042        PROLOGUE_4_ARGS
6043        IEMIMPL_SSE_PROLOGUE
6044        SSE_LD_FXSTATE_MXCSR A0
6045
6046        cvtsi2sd xmm0, qword [A3]           ; int64 -> double-precision float (may round)
6047        movq    [A2], xmm0
6048
6049        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
6050        IEMIMPL_SSE_EPILOGUE
6051        EPILOGUE_4_ARGS
6052ENDPROC iemAImpl_cvtsi2sd_r64_i64
6053
6054
6055;;
6056; Initialize the SSE MXCSR register using the guest value partially to
6057; account for rounding mode.
6058;
6059; @uses     4 bytes of stack to save the original value, T0.
6060; @param    1       Expression giving the address of the MXCSR register of the guest.
6061;
; Note: the first sub/stmxcsr saves the host MXCSR and deliberately leaves it
; on the stack; the matching SSE_ST_FXSTATE_MXCSR_ONLY* macro pops and
; restores it later ("@note Restores the stack pointer" there).
6062%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
6063        sub     xSP, 4
6064
6065        stmxcsr [xSP]                       ; save host MXCSR (left on stack for the ST macro)
6066        mov     T0_32, [%1]
6067        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ/RC/DAZ from the guest
6068        or      T0_32, X86_MXCSR_XCPT_MASK  ; mask all exceptions so the host never faults
6069        sub     xSP, 4
6070        mov     [xSP], T0_32
6071        ldmxcsr [xSP]                       ; activate the derived MXCSR
6072        add     xSP, 4
6073%endmacro
6074
6075
6076;;
6077; Restores the SSE MXCSR register with the original value.
6078;
6079; @uses     4 bytes of stack to save the content of MXCSR value, T0, T1.
6080; @param    1       Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
6081;
6082; @note Restores the stack pointer.
6083;
; Counterpart of SSE_LD_FXSTATE_MXCSR_ONLY: merges the freshly raised
; exception flags into the guest MXCSR at [%1] and pops/restores the host
; MXCSR value the LD macro left on the stack.
6084%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
6085        sub     xSP, 4
6086        stmxcsr [xSP]                       ; read the current (post-op) MXCSR
6087        mov     T0_32, [xSP]
6088        add     xSP, 4
6089        ; Merge the status bits into the original MXCSR value.
6090        mov     T1_32, [%1]
6091        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the exception status bits
6092        or      T0_32, T1_32
6093        mov     [%1], T0_32
6094
6095        ldmxcsr [xSP]                       ; restore the saved host MXCSR and pop it
6096        add     xSP, 4
6097%endmacro
6098
6099
6100;
6101; UCOMISS (SSE)
6102;
; Unordered scalar single-precision compare; result is delivered via EFLAGS
; (ZF/PF/CF), OF/SF/AF are cleared by the instruction.
6103; @param    A0      Pointer to the MXCSR value (input/output).
6104; @param    A1      Pointer to the EFLAGS value (input/output).
6105; @param    A2      Pointer to the first source operand (aka readonly destination).
6106; @param    A3      Pointer to the second source operand.
6107;
6108BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
6109        PROLOGUE_4_ARGS
6110        IEMIMPL_SSE_PROLOGUE
6111        SSE_LD_FXSTATE_MXCSR_ONLY A0        ; guest DAZ matters for the compare
6112
6113        movdqu  xmm0, [A2]
6114        movdqu  xmm1, [A3]
6115        ucomiss xmm0, xmm1
6116        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags
6117
6118        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6119        IEMIMPL_SSE_EPILOGUE
6120        EPILOGUE_4_ARGS
6121ENDPROC iemAImpl_ucomiss_u128
6122
; NOTE(review): this AVX form uses the SSE prologue/epilogue, unlike the
; v-forms in IEMIMPL_MEDIA_MOVMSK_P which use the AVX pair - confirm this is
; intentional against the macro definitions.
6123BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
6124        PROLOGUE_4_ARGS
6125        IEMIMPL_SSE_PROLOGUE
6126        SSE_LD_FXSTATE_MXCSR_ONLY A0
6127
6128        movdqu  xmm0, [A2]
6129        movdqu  xmm1, [A3]
6130        vucomiss xmm0, xmm1
6131        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6132
6133        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6134        IEMIMPL_SSE_EPILOGUE
6135        EPILOGUE_4_ARGS
6136ENDPROC iemAImpl_vucomiss_u128
6137
6138
6139;
6140; UCOMISD (SSE)
6141;
; Unordered scalar double-precision compare; result delivered via EFLAGS.
6142; @param    A0      Pointer to the MXCSR value (input/output).
6143; @param    A1      Pointer to the EFLAGS value (input/output).
6144; @param    A2      Pointer to the first source operand (aka readonly destination).
6145; @param    A3      Pointer to the second source operand.
6146;
6147BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
6148        PROLOGUE_4_ARGS
6149        IEMIMPL_SSE_PROLOGUE
6150        SSE_LD_FXSTATE_MXCSR_ONLY A0
6151
6152        movdqu  xmm0, [A2]
6153        movdqu  xmm1, [A3]
6154        ucomisd xmm0, xmm1
6155        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags
6156
6157        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6158        IEMIMPL_SSE_EPILOGUE
6159        EPILOGUE_4_ARGS
6160ENDPROC iemAImpl_ucomisd_u128
6161
; NOTE(review): AVX form with SSE prologue/epilogue - see note on vucomiss.
6162BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
6163        PROLOGUE_4_ARGS
6164        IEMIMPL_SSE_PROLOGUE
6165        SSE_LD_FXSTATE_MXCSR_ONLY A0
6166
6167        movdqu  xmm0, [A2]
6168        movdqu  xmm1, [A3]
6169        vucomisd xmm0, xmm1
6170        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6171
6172        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6173        IEMIMPL_SSE_EPILOGUE
6174        EPILOGUE_4_ARGS
6175ENDPROC iemAImpl_vucomisd_u128
6176
6177;
6178; COMISS (SSE)
6179;
; Ordered scalar single-precision compare (signals on QNaN, unlike UCOMISS);
; result delivered via EFLAGS.
6180; @param    A0      Pointer to the MXCSR value (input/output).
6181; @param    A1      Pointer to the EFLAGS value (input/output).
6182; @param    A2      Pointer to the first source operand (aka readonly destination).
6183; @param    A3      Pointer to the second source operand.
6184;
6185BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
6186        PROLOGUE_4_ARGS
6187        IEMIMPL_SSE_PROLOGUE
6188        SSE_LD_FXSTATE_MXCSR_ONLY A0
6189
6190        movdqu  xmm0, [A2]
6191        movdqu  xmm1, [A3]
6192        comiss  xmm0, xmm1
6193        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags
6194
6195        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6196        IEMIMPL_SSE_EPILOGUE
6197        EPILOGUE_4_ARGS
6198ENDPROC iemAImpl_comiss_u128
6199
; NOTE(review): AVX form with SSE prologue/epilogue - see note on vucomiss.
6200BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
6201        PROLOGUE_4_ARGS
6202        IEMIMPL_SSE_PROLOGUE
6203        SSE_LD_FXSTATE_MXCSR_ONLY A0
6204
6205        movdqu  xmm0, [A2]
6206        movdqu  xmm1, [A3]
6207        vcomiss xmm0, xmm1
6208        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6209
6210        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6211        IEMIMPL_SSE_EPILOGUE
6212        EPILOGUE_4_ARGS
6213ENDPROC iemAImpl_vcomiss_u128
6214
6215
6216;
6217; COMISD (SSE)
6218;
; Ordered scalar double-precision compare; result delivered via EFLAGS.
6219; @param    A0      Pointer to the MXCSR value (input/output).
6220; @param    A1      Pointer to the EFLAGS value (input/output).
6221; @param    A2      Pointer to the first source operand (aka readonly destination).
6222; @param    A3      Pointer to the second source operand.
6223;
6224BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
6225        PROLOGUE_4_ARGS
6226        IEMIMPL_SSE_PROLOGUE
6227        SSE_LD_FXSTATE_MXCSR_ONLY A0
6228
6229        movdqu  xmm0, [A2]
6230        movdqu  xmm1, [A3]
6231        comisd  xmm0, xmm1
6232        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; capture the compare result flags
6233
6234        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6235        IEMIMPL_SSE_EPILOGUE
6236        EPILOGUE_4_ARGS
6237ENDPROC iemAImpl_comisd_u128
6238
; NOTE(review): AVX form with SSE prologue/epilogue - see note on vucomiss.
6239BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
6240        PROLOGUE_4_ARGS
6241        IEMIMPL_SSE_PROLOGUE
6242        SSE_LD_FXSTATE_MXCSR_ONLY A0
6243
6244        movdqu  xmm0, [A2]
6245        movdqu  xmm1, [A3]
6246        vcomisd xmm0, xmm1
6247        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
6248
6249        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6250        IEMIMPL_SSE_EPILOGUE
6251        EPILOGUE_4_ARGS
6252ENDPROC iemAImpl_vcomisd_u128
6253
6254
6255;;
6256; Need to move this as well somewhere better?
6257;
; Layout for passing two 128-bit (XMM-sized) source operands in one buffer;
; must match the C-side IEMMEDIAF2XMMSRC structure.
6258struc IEMMEDIAF2XMMSRC
6259    .uSrc1 resd 4                           ; first 128-bit source operand
6260    .uSrc2 resd 4                           ; second 128-bit source operand
6261endstruc
6262
6263
6264;
6265; CMPPS (SSE)
6266;
; The compare predicate is an instruction-encoded immediate, so a 256-entry
; stub table is emitted and dispatched into by computed offset (A3 * stub size).
6267; @param    A0      Pointer to the MXCSR value (input/output).
6268; @param    A1      Pointer to the first media register size operand (output).
6269; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6270; @param    A3      The 8-bit immediate (input).
6271;
6272BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
6273        PROLOGUE_4_ARGS
6274        IEMIMPL_SSE_PROLOGUE
6275        SSE_LD_FXSTATE_MXCSR_ONLY A0
6276
6277        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6278        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6279        lea     T1, [.imm0 xWrtRIP]         ; base of the per-immediate stub table
6280 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6281        lea     T0, [A3 + A3*8]             ; sizeof(endbrxx+cmpps+ret) == 9: A3 * 9
6282 %else
6283        lea     T0, [A3 + A3*4]             ; sizeof(cmpps+ret) == 5: A3 * 5
6284 %endif
6285        lea     T1, [T1 + T0]
6286        IBT_NOTRACK
6287        call    T1                          ; stub leaves the compare result in xmm0
6288        movdqu  [A1], xmm0
6289
6290        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6291        IEMIMPL_SSE_EPILOGUE
6292        EPILOGUE_4_ARGS
6293 %assign bImm 0
6294 %rep 256
6295.imm %+ bImm:
6296        IBT_ENDBRxx_WITHOUT_NOTRACK
6297        cmpps   xmm0, xmm1, bImm
6298        ret
6299 %assign bImm bImm + 1
6300 %endrep
; Build-time check that every stub has the size assumed by the dispatch above.
6301.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x500
6302ENDPROC iemAImpl_cmpps_u128
6303
6304;;
6305; SSE instructions with 8-bit immediates of the form
6306;    xxx xmm1, xmm2, imm8.
6307; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
6308; register.
6309;
; Same 256-entry stub-table dispatch scheme as iemAImpl_cmpps_u128, but for
; instructions whose encoding is one byte longer (stub = 6 bytes without IBT).
6310; @param    1       The instruction name.
6311;
6312; @param    A0      Pointer to the MXCSR value (input/output).
6313; @param    A1      Pointer to the first media register size operand (output).
6314; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6315; @param    A3      The 8-bit immediate (input).
6316;
6317%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
6318BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6319        PROLOGUE_4_ARGS
6320        IEMIMPL_SSE_PROLOGUE
6321        SSE_LD_FXSTATE_MXCSR_ONLY A0
6322
6323        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6324        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6325        lea     T1, [.imm0 xWrtRIP]
6326 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6327        lea     T0, [A3 + A3*4]             ; sizeof(endbrxx+cmpXX+ret) == 10: A3 * 10 = (A3 * 5) * 2
6328 %else
6329        lea     T0, [A3 + A3*2]             ; sizeof(cmpXX+ret) == 6: A3 * 6 = (A3 * 3) * 2
6330 %endif
6331        lea     T1, [T1 + T0*2]             ; T1 = .imm0 + A3 * stub-size
6332        IBT_NOTRACK
6333        call    T1
6334        movdqu  [A1], xmm0
6335
6336        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6337        IEMIMPL_SSE_EPILOGUE
6338        EPILOGUE_4_ARGS
6339 %assign bImm 0
6340 %rep 256
6341.imm %+ bImm:
6342        IBT_ENDBRxx_WITHOUT_NOTRACK
6343        %1       xmm0, xmm1, bImm
6344        ret
6345 %assign bImm bImm + 1
6346 %endrep
; Build-time check that every stub has the size assumed by the dispatch above.
6347.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6348ENDPROC iemAImpl_ %+ %1 %+ _u128
6349%endmacro
6350
6351IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
6352IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
6353IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6354
6355;;
6356; SSE instructions with 8-bit immediates of the form
6357;    xxx xmm1, xmm2, imm8.
6358; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
6359; register.
6360;
; Stub-table variant for 6-byte encodings; an int3 pads each stub to 8 bytes
; so the dispatch offset can be computed with a single scaled addressing mode.
6361; @param    1       The instruction name.
6362;
6363; @param    A0      Pointer to the MXCSR value (input/output).
6364; @param    A1      Pointer to the first media register size operand (output).
6365; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
6366; @param    A3      The 8-bit immediate (input).
6367;
6368%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
6369BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6370        PROLOGUE_4_ARGS
6371        IEMIMPL_SSE_PROLOGUE
6372        SSE_LD_FXSTATE_MXCSR_ONLY A0
6373
6374        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
6375        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
6376        lea     T1, [.imm0 xWrtRIP]
6377 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6378        lea     T0, [A3 + A3*2]             ; sizeof(endbrxx+insn+ret+int3) == 12: A3 * 12 = (A3 * 3) * 4
6379        lea     T1, [T1 + T0*4]
6380 %else
6381        lea     T1, [T1 + A3*8]             ; sizeof(insn+ret+int3) == 8: A3 * 8
6382 %endif
6383        IBT_NOTRACK
6384        call    T1
6385        movdqu  [A1], xmm0
6386
6387        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6388        IEMIMPL_SSE_EPILOGUE
6389        EPILOGUE_4_ARGS
6390 %assign bImm 0
6391 %rep 256
6392.imm %+ bImm:
6393        IBT_ENDBRxx_WITHOUT_NOTRACK
6394        %1       xmm0, xmm1, bImm
6395        ret
6396        int3                                ; padding to keep all stubs the same size
6397 %assign bImm bImm + 1
6398 %endrep
; Build-time check that every stub has the size assumed by the dispatch above.
6399.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x800
6400ENDPROC iemAImpl_ %+ %1 %+ _u128
6401%endmacro
6402
6403IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
6404IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
6405IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
6406IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6407IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dpps
6408IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 dppd
6409
6410
6411;;
6412; SSE instructions of the form
6413;    xxx mm, xmm.
6414; and we need to load and save the MXCSR register.
6415;
; Converts a 128-bit media source into a 64-bit MMX result (e.g. packed
; double -> packed int32) under the guest's MXCSR settings.
6416; @param    1       The instruction name.
6417;
6418; @param    A0      Pointer to the MXCSR value (input/output).
6419; @param    A1      Pointer to the first MMX register sized operand (output).
6420; @param    A2      Pointer to the media register sized operand (input).
6421;
6422%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
6423BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6424        PROLOGUE_3_ARGS
6425        IEMIMPL_SSE_PROLOGUE
6426        SSE_LD_FXSTATE_MXCSR_ONLY A0
6427
6428        movdqu  xmm0, [A2]
6429        %1       mm0, xmm0                  ; convert xmm source into the 64-bit MMX result
6430        movq    [A1], mm0
6431
6432        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6433        IEMIMPL_SSE_EPILOGUE
6434        EPILOGUE_3_ARGS
6435ENDPROC iemAImpl_ %+ %1 %+ _u128
6436%endmacro
6437
6438IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
6439IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6440
6441;;
6442; SSE instructions of the form
6443;    xxx xmm, xmm/m64.
6444; and we need to load and save the MXCSR register.
6445;
; Merges a 64-bit MMX source value into an existing 128-bit destination
; (e.g. packed int32 -> packed float) under the guest's MXCSR settings.
6446; @param    1       The instruction name.
6447;
6448; @param    A0      Pointer to the MXCSR value (input/output).
6449; @param    A1      Pointer to the first media register sized operand (input/output).
6450; @param    A2      The 64bit source value from a MMX media register (input)
6451;
6452%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
6453BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6454        PROLOGUE_3_ARGS
6455        IEMIMPL_SSE_PROLOGUE
6456        SSE_LD_FXSTATE_MXCSR_ONLY A0
6457
6458        movdqu  xmm0, [A1]                  ; load the full destination; high part is preserved
6459        movq    mm0, A2                     ; source value arrives by value in a GPR
6460        %1       xmm0, mm0
6461        movdqu  [A1], xmm0
6462
6463        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6464        IEMIMPL_SSE_EPILOGUE
6465        EPILOGUE_3_ARGS
6466ENDPROC iemAImpl_ %+ %1 %+ _u128
6467%endmacro
6468
6469IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
6470IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6471
6472;;
6473; SSE instructions of the form
6474;    xxx mm, xmm/m64.
6475; and we need to load and save the MXCSR register.
6476;
; Converts a 64-bit source value into a 64-bit MMX result (e.g. two packed
; floats -> two packed int32) under the guest's MXCSR settings.
6477; @param    1       The instruction name.
6478;
6479; @param    A0      Pointer to the MXCSR value (input/output).
6480; @param    A1      Pointer to the first MMX media register sized operand (output).
6481; @param    A2      The 64bit source value (input).
6482;
6483%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
6484BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
6485        PROLOGUE_3_ARGS
6486        IEMIMPL_SSE_PROLOGUE
6487        SSE_LD_FXSTATE_MXCSR_ONLY A0
6488
6489        movq    xmm0, A2                    ; source value arrives by value in a GPR
6490        %1       mm0, xmm0
6491        movq    [A1], mm0
6492
6493        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
6494        IEMIMPL_SSE_EPILOGUE
6495        EPILOGUE_3_ARGS
6496ENDPROC iemAImpl_ %+ %1 %+ _u128
6497%endmacro
6498
6499IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
6500IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6501
6502;
6503; All forms of RDRAND and RDSEED
6504;
; @param    1       The instruction name (rdrand or rdseed).
; @param    2       The register to receive the random value (ax/eax/rax).
; @param    3       The operand width in bits (16/32/64), used in the symbol name.
;
6505; @param    A0      Pointer to the destination operand.
6506; @param    A1      Pointer to the EFLAGS value (input/output).
6507;
6508%macro IEMIMPL_RDRAND_RDSEED 3
6509BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
6510        PROLOGUE_2_ARGS
6511
6512        %1      %2                          ; CF=1 on success, 0 if no entropy available
6513        mov     [A0], %2
6514        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; propagate CF (and cleared flags) to the guest
6515
6516        EPILOGUE_2_ARGS
6517ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
6518%endmacro
6519
6520IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
6521IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
6522IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
6523IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
6524IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
6525IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6526
6527
6528;;
6529; sha1rnds4 xmm1, xmm2, imm8.
6530;
; The two-bit round-function selector is an instruction-encoded immediate, so
; the usual 256-entry stub table is emitted and dispatched by computed offset.
6531; @param    1       The instruction name.
6532;
6533; @param    A0      Pointer to the first media register size operand (input/output).
6534; @param    A1      Pointer to the second source media register size operand (input).
6535; @param    A2      The 8-bit immediate
6536;
6537BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
6538        PROLOGUE_3_ARGS
6539        IEMIMPL_SSE_PROLOGUE
6540
6541        movdqu  xmm0, [A0]
6542        movdqu  xmm1, [A1]
6543        lea     T1, [.imm0 xWrtRIP]         ; base of the per-immediate stub table
6544 %ifdef RT_WITH_IBT_BRANCH_PROTECTION_WITHOUT_NOTRACK
6545        lea     T0, [A2 + A2*4]             ; sizeof(endbrxx+sha1rnds4+ret) == 10: A2 * 10 = (A2 * 5) * 2
6546 %else
6547        lea     T0, [A2 + A2*2]             ; sizeof(sha1rnds4+ret) == 6: A2 * 6 = (A2 * 3) * 2
6548 %endif
6549        lea     T1, [T1 + T0*2]
6550        IBT_NOTRACK
6551        call    T1
6552        movdqu  [A0], xmm0                  ; write result back to the destination operand
6553
6554        IEMIMPL_SSE_EPILOGUE
6555        EPILOGUE_3_ARGS
6556 %assign bImm 0
6557 %rep 256
6558.imm %+ bImm:
6559        IBT_ENDBRxx_WITHOUT_NOTRACK
6560        sha1rnds4 xmm0, xmm1, bImm
6561        ret
6562 %assign bImm bImm + 1
6563 %endrep
; Build-time check that every stub has the size assumed by the dispatch above.
6564.immEnd: IEMCHECK_256_JUMP_ARRAY_SIZE (.immEnd - .imm0), 0x600
6565ENDPROC iemAImpl_sha1rnds4_u128
6566
6567
6568;;
6569; sha256rnds2 xmm1, xmm2, <XMM0>.
6570;
; The instruction has an implicit third operand in XMM0, hence the extra
; pointer argument used to seed xmm0 before the round computation.
6571; @param    1       The instruction name.
6572;
6573; @param    A0      Pointer to the first media register size operand (input/output).
6574; @param    A1      Pointer to the second source media register size operand (input).
6575; @param    A2      Pointer to the implicit XMM0 constants (input).
6576;
6577BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
6578        PROLOGUE_3_ARGS
6579        IEMIMPL_SSE_PROLOGUE
6580
6581        movdqu  xmm0, [A2]                  ; implicit XMM0 operand (round key/wk values)
6582        movdqu  xmm1, [A0]
6583        movdqu  xmm2, [A1]
6584        sha256rnds2 xmm1, xmm2
6585        movdqu  [A0], xmm1                  ; write result back to the destination operand
6586
6587        IEMIMPL_SSE_EPILOGUE
6588        EPILOGUE_3_ARGS
6589ENDPROC iemAImpl_sha256rnds2_u128
6590
6591
6592;
6593; 32-bit forms of ADCX and ADOX
6594;
; Macro param 1 is the instruction (adcx/adox); param 2 the single EFLAGS bit
; it consumes and produces (CF for adcx, OF for adox).
6595; @param    A0      Pointer to the destination operand (input/output).
6596; @param    A1      Pointer to the EFLAGS value (input/output).
6597; @param    A2      32-bit source operand 1 (input).
6598;
6599%macro IEMIMPL_ADX_32 2
6600BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
6601        PROLOGUE_4_ARGS
6602
6603        IEM_LOAD_FLAGS A1, %2, 0            ; seed only the carry bit the instruction reads
6604        %1      A2_32, [A0]                 ; A2 += [A0] + flag; other flags untouched
6605        mov     [A0], A2_32
6606        IEM_SAVE_FLAGS A1, %2, 0            ; write back only that one flag bit
6607
6608        EPILOGUE_4_ARGS
6609ENDPROC iemAImpl_ %+ %1 %+ _u32
6610%endmacro
6611
6612;
6613; 64-bit forms of ADCX and ADOX
6614;
6615; @param    A0      Pointer to the destination operand (input/output).
6616; @param    A1      Pointer to the EFLAGS value (input/output).
6617; @param    A2      64-bit source operand 1 (input).
6618;
6619%macro IEMIMPL_ADX_64 2
6620BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
6621        PROLOGUE_4_ARGS
6622
6623        IEM_LOAD_FLAGS A1, %2, 0
6624        %1      A2, [A0]
6625        mov     [A0], A2
6626        IEM_SAVE_FLAGS A1, %2, 0
6627
6628        EPILOGUE_4_ARGS
6629ENDPROC iemAImpl_ %+ %1 %+ _u64
6630%endmacro
6631
6632IEMIMPL_ADX_32 adcx, X86_EFL_CF
6633IEMIMPL_ADX_64 adcx, X86_EFL_CF
6634
6635IEMIMPL_ADX_32 adox, X86_EFL_OF
6636IEMIMPL_ADX_64 adox, X86_EFL_OF
; (Removed non-source residue: Trac repository-browser footer text that was
; captured with this excerpt and is not part of IEMAllAImpl.asm.)