VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllAImpl.asm@ 98873

Last change on this file since 98873 was 98827, checked in by vboxsync, 2 years ago

VMM/IEM: Implement adcx/adox instructions emulation, bugref:9898

  • 屬性 svn:eol-style 設為 native
  • 屬性 svn:keywords 設為 Author Date Id Revision
檔案大小: 184.9 KB
 
1; $Id: IEMAllAImpl.asm 98827 2023-03-03 12:01:42Z vboxsync $
2;; @file
3; IEM - Instruction Implementation in Assembly.
4;
5
6;
7; Copyright (C) 2011-2023 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; SPDX-License-Identifier: GPL-3.0-only
26;
27
28
29;*********************************************************************************************************************************
30;* Header Files *
31;*********************************************************************************************************************************
32%include "VBox/asmdefs.mac"
33%include "VBox/err.mac"
34%include "iprt/x86.mac"
35
36
37;*********************************************************************************************************************************
38;* Defined Constants And Macros *
39;*********************************************************************************************************************************
40
;;
; RET XX / RET wrapper for fastcall.
;
; On 32-bit Windows the fastcall callee pops its stack arguments (ret %1);
; on every other target a plain near return suffices (register args on
; AMD64, caller clean-up elsewhere on x86).
;
%macro RET_FASTCALL 1
 %ifndef RT_ARCH_X86
        ret                     ; AMD64 and friends: nothing to pop.
 %elifdef RT_OS_WINDOWS
        ret %1                  ; x86 Windows fastcall: callee pops %1 bytes.
 %else
        ret                     ; x86 non-Windows: caller cleans up.
 %endif
%endmacro
55
;;
; NAME for fastcall functions.
;
; Expands to the platform-decorated symbol name.  Only 32-bit Windows
; decorates fastcall symbols (prefix + '@' + byte count of arguments);
; everywhere else this is just NAME(a_Name) and the extra arguments are
; ignored.
;
;; @todo 'global @fastcall@12' is still broken in yasm and requires dollar
; escaping (or whatever the dollar is good for here). Thus the ugly
; prefix argument.
;
%define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) NAME(a_Name)
%ifdef RT_ARCH_X86
 %ifdef RT_OS_WINDOWS
  ; Redefine for x86 Windows: e.g. @iemAImpl_add_u8@12.
  %undef NAME_FASTCALL
  %define NAME_FASTCALL(a_Name, a_cbArgs, a_Prefix) a_Prefix %+ a_Name %+ @ %+ a_cbArgs
 %endif
%endif
70
;;
; BEGINPROC for fastcall functions.
;
; Emits the export directive required by the object format (PE / OMF),
; makes the symbol global (except for flat binary output), and defines
; the decorated label itself.
;
; @param 1 The function name (C).
; @param 2 The argument size on x86 (bytes popped by RET on Win32 fastcall).
;
%macro BEGINPROC_FASTCALL 2
 %ifdef ASM_FORMAT_PE
        export %1=NAME_FASTCALL(%1,%2,$@)
 %endif
 %ifdef __NASM__
  %ifdef ASM_FORMAT_OMF
        export NAME(%1) NAME_FASTCALL(%1,%2,$@)
  %endif
 %endif
 %ifndef ASM_FORMAT_BIN
        global NAME_FASTCALL(%1,%2,$@)
 %endif
NAME_FASTCALL(%1,%2,@):
%endmacro
91
92
;
; We employ some macro assembly here to hide the calling convention differences.
;
; PROLOGUE_n_ARGS / EPILOGUE_n_ARGS bracket each function body, and the
; A0..A3 defines alias the n argument registers while T0..T2 alias scratch
; registers, so the same bodies assemble for the AMD64 (GCC/SysV & MSC)
; and x86 fastcall conventions.  The _EX epilogue variants take the byte
; count to pop off the stack on return (32-bit Windows fastcall only).
;
%ifdef RT_ARCH_AMD64
 ; AMD64: up to four arguments arrive in registers, so the prologues do
 ; nothing and the epilogues are plain near returns.
 %macro PROLOGUE_1_ARGS 0
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 0
        ret
 %endmacro

 %macro PROLOGUE_2_ARGS 0
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_3_ARGS 0
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
        ret
 %endmacro

 %macro PROLOGUE_4_ARGS 0
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        ret
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
        ret
 %endmacro

 %ifdef ASM_CALL64_GCC
  ; System V AMD64 ABI: integer args in rdi, rsi, rdx, rcx.
  %define A0        rdi
  %define A0_32     edi
  %define A0_16     di
  %define A0_8      dil

  %define A1        rsi
  %define A1_32     esi
  %define A1_16     si
  %define A1_8      sil

  %define A2        rdx
  %define A2_32     edx
  %define A2_16     dx
  %define A2_8      dl

  %define A3        rcx
  %define A3_32     ecx
  %define A3_16     cx
 %endif

 %ifdef ASM_CALL64_MSC
  ; Microsoft x64 ABI: integer args in rcx, rdx, r8, r9.
  %define A0        rcx
  %define A0_32     ecx
  %define A0_16     cx
  %define A0_8      cl

  %define A1        rdx
  %define A1_32     edx
  %define A1_16     dx
  %define A1_8      dl

  %define A2        r8
  %define A2_32     r8d
  %define A2_16     r8w
  %define A2_8      r8b

  %define A3        r9
  %define A3_32     r9d
  %define A3_16     r9w
 %endif

 ; Scratch registers - all volatile in both 64-bit conventions, so no
 ; saving/restoring is needed.
 %define T0         rax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 %define T1         r11
 %define T1_32      r11d
 %define T1_16      r11w
 %define T1_8       r11b

 %define T2         r10                 ; only AMD64
 %define T2_32      r10d
 %define T2_16      r10w
 %define T2_8       r10b

%else
 ; x86: fastcall passes the first two args in ecx/edx; further args come
 ; off the stack and are loaded into callee-saved ebx/esi, which the
 ; prologues therefore have to preserve (along with edi used as T1).
 %macro PROLOGUE_1_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_1_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_1_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_2_ARGS 0
        push    edi
 %endmacro
 %macro EPILOGUE_2_ARGS 0
        pop     edi
        ret     0
 %endmacro
 %macro EPILOGUE_2_ARGS_EX 1
        pop     edi
        ret     %1
 %endmacro

 %macro PROLOGUE_3_ARGS 0
        push    ebx
        mov     ebx, [esp + 4 + 4]      ; load 3rd arg (after push ebx + return address).
        push    edi
 %endmacro
 %macro EPILOGUE_3_ARGS_EX 1
  %if (%1) < 4
   %error "With three args, at least 4 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_3_ARGS 0
        EPILOGUE_3_ARGS_EX 4
 %endmacro

 %macro PROLOGUE_4_ARGS 0
        push    ebx
        push    edi
        push    esi
        mov     ebx, [esp + 12 + 4 + 0] ; 3rd arg (3 pushes + return address).
        mov     esi, [esp + 12 + 4 + 4] ; 4th arg.
 %endmacro
 %macro EPILOGUE_4_ARGS_EX 1
  %if (%1) < 8
   %error "With four args, at least 8 bytes must be remove from the stack upon return (32-bit)."
  %endif
        pop     esi
        pop     edi
        pop     ebx
        ret     %1
 %endmacro
 %macro EPILOGUE_4_ARGS 0
        EPILOGUE_4_ARGS_EX 8
 %endmacro

 %define A0         ecx
 %define A0_32      ecx
 %define A0_16      cx
 %define A0_8       cl

 %define A1         edx
 %define A1_32      edx
 %define A1_16      dx
 %define A1_8       dl

 %define A2         ebx
 %define A2_32      ebx
 %define A2_16      bx
 %define A2_8       bl

 %define A3         esi
 %define A3_32      esi
 %define A3_16      si

 %define T0         eax
 %define T0_32      eax
 %define T0_16      ax
 %define T0_8       al

 ; Note: no T1_8 (edi has no byte form) and no T2 on x86.
 %define T1         edi
 %define T1_32      edi
 %define T1_16      di
%endif
281
282
;;
; Load the relevant flags from [%1] if there are undefined flags (%3).
;
; Merges the guest EFLAGS into the host EFLAGS so the emulating instruction
; executes with the guest's flag state.  The conditional is currently
; commented out, so the load happens unconditionally.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2 The register pointing to the flags.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of modified flags.
; @param 3 The set of undefined flags.
;
%macro IEM_MAYBE_LOAD_FLAGS 3
 ;%if (%3) != 0
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
 ;%endif
%endmacro
302
;;
; Load the relevant flags from [%1].
;
; Unconditional variant of IEM_MAYBE_LOAD_FLAGS: merges the selected guest
; flags into the host EFLAGS via the stack.
;
; @remarks Clobbers T0, stack. Changes EFLAGS.
; @param A2 The register pointing to the flags.
; @param 1 The parameter (A0..A3) pointing to the eflags.
; @param 2 The set of flags to load.
; @param 3 The set of undefined flags.
;
%macro IEM_LOAD_FLAGS 3
        pushf                           ; store current flags
        mov     T0_32, [%1]             ; load the guest flags
        and     dword [xSP], ~(%2 | %3) ; mask out the modified and undefined flags
        and     T0_32, (%2 | %3)        ; select the modified and undefined flags.
        or      [xSP], T0               ; merge guest flags with host flags.
        popf                            ; load the mixed flags.
%endmacro
320
;;
; Update the flag.
;
; Captures the host EFLAGS produced by the emulating instruction and folds
; the modified/undefined subset back into the guest eflags at [%1].
; Compiles to nothing when both masks are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 The mask of undefined flags to (maybe) save.
;
%macro IEM_SAVE_FLAGS 3
 %if (%2 | %3) != 0
        pushf
        pop     T1                      ; T1 = host flags after the operation.
        mov     T0_32, [%1]             ; flags
        and     T0_32, ~(%2 | %3)       ; clear the modified & undefined flags.
        and     T1_32, (%2 | %3)        ; select the modified and undefined flags.
        or      T0_32, T1_32            ; combine the flags.
        mov     [%1], T0_32             ; save the flags.
 %endif
%endmacro
340
;;
; Calculates the new EFLAGS based on the CPU EFLAGS and fixed clear and set bit masks.
;
; Like IEM_SAVE_FLAGS, but additionally forces the %3 flags clear and the
; %4 flags set in the saved result.  Compiles to nothing when all masks
; are zero.
;
; @remarks Clobbers T0, T1, stack.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 Mask of additional flags to always set.
;
%macro IEM_SAVE_AND_ADJUST_FLAGS 4
 %if (%2 | %3 | %4) != 0
        pushf
        pop     T1                      ; T1 = host flags after the operation.
        mov     T0_32, [%1]             ; load flags.
        and     T0_32, ~(%2 | %3)       ; clear the modified and always cleared flags.
        and     T1_32, (%2)             ; select the modified flags.
        or      T0_32, T1_32            ; combine the flags.
  %if (%4) != 0
        or      T0_32, %4               ; add the always set flags.
  %endif
        mov     [%1], T0_32             ; save the result.
 %endif
%endmacro
364
;;
; Calculates the new EFLAGS based on the CPU EFLAGS (%2), a clear mask (%3),
; signed input (%4[%5]) and parity index (%6).
;
; This is used by MUL and IMUL, where we got result (%4 & %6) in xAX which is
; also T0. So, we have to use T1 for the EFLAGS calculation and save T0/xAX
; while we extract the %2 flags from the CPU EFLAGS or use T2 (only AMD64).
;
; @remarks Clobbers T0, T1, stack, %6, EFLAGS.
; @param 1 The register pointing to the EFLAGS.
; @param 2 The mask of modified flags to save.
; @param 3 Mask of additional flags to always clear
; @param 4 The result register to set SF by.
; @param 5 The width of the %4 register in bits (8, 16, 32, or 64).
; @param 6 The (full) register containing the parity table index. Will be modified!

%macro IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF 6
 %ifdef RT_ARCH_AMD64
        pushf
        pop     T2                      ; T2 = host flags (T2 only exists on AMD64).
 %else
        push    T0                      ; x86: no T2, so preserve T0/xAX across the flag fetch.
        pushf
        pop     T0
 %endif
        mov     T1_32, [%1]             ; load flags.
        and     T1_32, ~(%2 | %3 | X86_EFL_PF | X86_EFL_SF) ; clear the modified, always cleared flags and the two flags we calc.
 %ifdef RT_ARCH_AMD64
        and     T2_32, (%2)             ; select the modified flags.
        or      T1_32, T2_32            ; combine the flags.
 %else
        and     T0_32, (%2)             ; select the modified flags.
        or      T1_32, T0_32            ; combine the flags.
        pop     T0                      ; restore T0/xAX (the result).
 %endif

        ; First calculate SF as it's likely to be referring to the same register as %6 does.
        bt      %4, %5 - 1              ; CF = sign bit of the result.
        jnc     %%sf_clear
        or      T1_32, X86_EFL_SF
 %%sf_clear:

        ; Parity last.
        and     %6, 0xff                ; index the 256-entry parity lookup table by the low result byte.
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T1_8, [T2 + %6]
 %else
        or      T1_8, [NAME(g_afParity) + %6]
 %endif

        mov     [%1], T1_32             ; save the result.
%endmacro
418
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Does not look at the host EFLAGS at all; simply clears %2 and sets %3 in
; the guest eflags.  Compiles to nothing when both masks are zero.
;
; @remarks Clobbers T0.
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
;
%macro IEM_ADJUST_FLAGS 3
 %if (%2 | %3) != 0
        mov     T0_32, [%1]             ; Load flags.
  %if (%2) != 0
        and     T0_32, ~(%2)            ; Remove the always cleared flags.
  %endif
  %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
  %endif
        mov     [%1], T0_32             ; Save the result.
 %endif
%endmacro
439
;;
; Calculates the new EFLAGS using fixed clear and set bit masks.
;
; Like IEM_ADJUST_FLAGS, but additionally computes PF from the low byte of
; %4 via the g_afParity lookup table.
;
; @remarks Clobbers T0, %4, EFLAGS.  Also clobbers T2 on AMD64 (table address).
; @param 1 The register pointing to the EFLAGS.
; @param 2 Mask of additional flags to always clear
; @param 3 Mask of additional flags to always set.
; @param 4 The (full) register containing the parity table index. Will be modified!
;
%macro IEM_ADJUST_FLAGS_WITH_PARITY 4
        mov     T0_32, [%1]             ; Load flags.
        and     T0_32, ~(%2 | X86_EFL_PF) ; Remove PF and the always cleared flags.
 %if (%3) != 0
        or      T0_32, %3               ; Add the always set flags.
 %endif
        and     %4, 0xff                ; Reduce to the table index (low result byte).
 %ifdef RT_ARCH_AMD64
        lea     T2, [NAME(g_afParity) xWrtRIP]
        or      T0_8, [T2 + %4]         ; OR in X86_EFL_PF when the byte has even parity.
 %else
        or      T0_8, [NAME(g_afParity) + %4]
 %endif
        mov     [%1], T0_32             ; Save the result.
%endmacro
464
465
466;*********************************************************************************************************************************
467;* External Symbols *
468;*********************************************************************************************************************************
469extern NAME(g_afParity)
470
471
;;
; Macro for implementing a binary operator.
;
; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; Each generated function loads the guest flags, performs '%1 [A0], A1' at
; the given width, and writes the modified/undefined flags back.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIN_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 byte [A0], A1_8
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; instr,lock, modified-flags, undefined flags
IEMIMPL_BIN_OP add,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP adc,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sub,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP sbb,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP or,   1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP xor,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP and,  1, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
IEMIMPL_BIN_OP cmp,  0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_BIN_OP test, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), X86_EFL_AF
571
572
;;
; Macro for implementing a binary operator, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0_32, A1_32, A2_32     ; three-operand VEX form: result in T0.
        mov     [A0], T0_32
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A3, %2, %3
        %1      T0, A1, A2
        mov     [A0], T0
        IEM_SAVE_FLAGS          A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP andn,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bextr, (X86_EFL_OF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP bzhi,  (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
613
;;
; Macro for implementing BLSR, BLCMSK and BLSI (fallbacks implemented in C).
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; The destination is read-modify-write: [A0] is both the second source and
; the result location.
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
;
%macro IEMIMPL_VEX_BIN_OP_2 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0_32, [A0]             ; load the current destination value.
        %1      T0_32, A1_32
        mov     [A0], T0_32             ; write back the result.
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        mov     T0, [A0]
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, modified-flags, undefined-flags
IEMIMPL_VEX_BIN_OP_2 blsr,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsmsk, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
IEMIMPL_VEX_BIN_OP_2 blsi,   (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_CF), (X86_EFL_AF | X86_EFL_PF)
655
656
;;
; Macro for implementing a binary operator w/o flags, VEX variant with separate input/output.
;
; This will generate code for the 32 and 64 bit accesses, except on 32-bit system
; where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the first source register operand in A1, the second source register operand
; in A2 and a pointer to eflags in A3.
;
; The fallback variants emulate the shift with the legacy %2 instruction,
; which needs the count in CL.  On MSC/x86 the destination pointer lives in
; (e)cx, so A0 and A2 are swapped first: afterwards CL holds the count and
; A2 holds the destination pointer.
;
; @param 1 The instruction mnemonic.
; @param 2 Fallback instruction if applicable.
; @param 3 Whether to emit fallback or not.
;
%macro IEMIMPL_VEX_BIN_OP_NOEFL 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32, A2_32
        mov     [A0], T0_32
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; count into CL (A3/rcx is free here).
        %2      A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_fallback
 %endif

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        %1      T0, A1, A2
        mov     [A0], T0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

 %if %3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_fallback, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        %2      A1, cl
        mov     [A0], A1                ; full 64-bit store of the result.
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        %2      A1, cl
        mov     [A2], A1                ; full 64-bit store of the result.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_fallback
 %endif
 %endif ; RT_ARCH_AMD64
%endmacro

; instr, fallback instr, emit fallback
IEMIMPL_VEX_BIN_OP_NOEFL sarx, sar, 1
IEMIMPL_VEX_BIN_OP_NOEFL shlx, shl, 1
IEMIMPL_VEX_BIN_OP_NOEFL shrx, shr, 1
IEMIMPL_VEX_BIN_OP_NOEFL pdep, nop, 0
IEMIMPL_VEX_BIN_OP_NOEFL pext, nop, 0
728
729
;
; RORX uses an immediate byte for the shift count, so we only do a
; fallback implementation of that one, emulating it with ROR and the
; count in CL.  On MSC/x86 the destination pointer arrives in (e)cx,
; so A0 and A2 are swapped first: CL then holds the count and A2 the
; destination pointer.
;
BEGINPROC_FASTCALL iemAImpl_rorx_u32, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8                ; rotate count into CL.
        ror     A1_32, cl
        mov     [A0], A1_32
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        ror     A1_32, cl
        mov     [A2], A1_32
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_rorx_u64, 12
        PROLOGUE_3_ARGS
 %ifdef ASM_CALL64_GCC
        mov     cl, A2_8
        ror     A1, cl
        mov     [A0], A1                ; full 64-bit store of the result.
 %else
        xchg    A2, A0                  ; CL = count, A2 = destination pointer.
        ror     A1, cl
        mov     [A2], A1                ; full 64-bit store of the result.
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_rorx_u64
 %endif ; RT_ARCH_AMD64
764
765
;
; MULX - iemAImpl_mulx_uNN(puDstHi /*A0*/, puDstLo /*A1*/, uSrc1 /*A2*/, uSrc2 /*A3*/)
;
; The hardware variants need uSrc1 in (r/e)dx (implicit MULX source); the
; fallbacks need it as the MUL operand while keeping (r/e)dx free for the
; high half of the product.  When A1 is xDX (MSC and x86), A1 and A2 are
; swapped first so that xDX holds uSrc1 and A2 holds the low-destination
; pointer.
;
BEGINPROC_FASTCALL iemAImpl_mulx_u32, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX - perfect.
        mulx    T0_32, T1_32, A3_32
        mov     [A1], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%else
        ; A1 is xDX - must switch A1 and A2, so EDX=uSrc1.
        xchg    A1, A2
        mulx    T0_32, T1_32, A3_32
        mov     [A2], T1_32             ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0_32
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32


BEGINPROC_FASTCALL iemAImpl_mulx_u32_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2_32 is EDX, T0_32 is EAX.
        mov     eax, A3_32
        mul     A2_32                   ; EDX:EAX = uSrc2 * uSrc1.
        mov     [A1], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%else
        ; A1 is xDX, T0_32 is EAX - must switch A1 and A2, so EDX=uSrc1.
        xchg    A1, A2
        mov     eax, A3_32
        mul     A1_32                   ; EDX:EAX = uSrc2 * uSrc1 (uSrc1 is in A1=EDX after the xchg; A2 now holds a pointer).
        mov     [A2], eax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], edx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u32_fallback

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_mulx_u64, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX - perfect.
        mulx    T0, T1, A3
        mov     [A1], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%else
        ; A1 is xDX - must switch A1 and A2, so RDX=uSrc1.
        xchg    A1, A2
        mulx    T0, T1, A3
        mov     [A2], T1                ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], T0
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64


BEGINPROC_FASTCALL iemAImpl_mulx_u64_fallback, 16
        PROLOGUE_4_ARGS
%ifdef ASM_CALL64_GCC
        ; A2 is RDX, T0 is RAX.
        mov     rax, A3
        mul     A2                      ; RDX:RAX = uSrc2 * uSrc1.
        mov     [A1], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%else
        ; A1 is xDX, T0 is RAX - must switch A1 and A2, so RDX=uSrc1.
        xchg    A1, A2
        mov     rax, A3
        mul     A1                      ; RDX:RAX = uSrc2 * uSrc1 (uSrc1 is in A1=RDX after the xchg; A2 now holds a pointer).
        mov     [A2], rax               ; Low value first, as we should return the high part if same destination registers.
        mov     [A0], rdx
%endif
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_mulx_u64_fallback

%endif
845
846
;;
; Macro for implementing a bit operator.
;
; This will generate code for the 16, 32 and 64 bit accesses with locked
; variants, except on 32-bit system where the 64-bit accesses requires hand
; coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; @param 1 The instruction mnemonic.
; @param 2 Non-zero if there should be a locked version.
; @param 3 The modified flags.
; @param 4 The undefined flags.
;
%macro IEMIMPL_BIT_OP 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        %1      qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

 %if %2 != 0 ; locked versions requested?

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 word [A0], A1_16
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 dword [A0], A1_32
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

  %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %3, %4
        lock %1 qword [A0], A1
        IEM_SAVE_FLAGS          A2, %3, %4
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
  %endif ; RT_ARCH_AMD64
 %endif ; locked
%endmacro

; BT has no memory-modifying side effect, so no locked variant.
IEMIMPL_BIT_OP bt,  0, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btc, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP bts, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
IEMIMPL_BIT_OP btr, 1, (X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF)
923
;;
; Macro for implementing a bit search operator.
;
; This will generate code for the 16, 32 and 64 bit accesses, except on 32-bit
; system where the 64-bit accesses requires hand coding.
;
; All the functions takes a pointer to the destination memory operand in A0,
; the source register operand in A1 and a pointer to eflags in A2.
;
; In the ZF case the destination register is 'undefined', however it seems that
; both AMD and Intel just leaves it as is. The undefined EFLAGS differs between
; AMD and Intel and according to https://www.sandpile.org/x86/flags.htm between
; Intel microarchitectures. We only implement 'intel' and 'amd' variation with
; the behaviour of more recent CPUs (Intel 10980X and AMD 3990X).
;
; Three variants per width are emitted: native host flags, an 'intel' flavour
; (clears OF/SF/AF/CF/ZF and computes PF from the result, or sets ZF|PF when
; the source was zero), and an 'amd' flavour (only ZF taken from the host).
;
; @param 1 The instruction mnemonic.
; @param 2 The modified flags.
; @param 3 The undefined flags.
; @param 4 Non-zero if destination isn't written when ZF=1. Zero if always written.
;
%macro IEMIMPL_BIT_OP2 4
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst          ; ZF=1: source was zero, leave destination as-is.
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_16
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        ; Zero input: Intel sets ZF and PF, clears the rest.
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_16, A1_16
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_16
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_amd


BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _intel, 12
        PROLOGUE_3_ARGS
        %1      T1_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1_32
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ _amd, 12
        PROLOGUE_3_ARGS
        %1      T0_32, A1_32
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0_32
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_amd


 %ifdef RT_ARCH_AMD64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_FLAGS          A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _intel, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS    A2, %2, %3
        %1      T1, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T1
        IEM_ADJUST_FLAGS_WITH_PARITY    A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF | X86_EFL_ZF, 0, T1
        EPILOGUE_3_ARGS
.unchanged_dst:
        IEM_ADJUST_FLAGS                A2, X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_CF, X86_EFL_ZF | X86_EFL_PF
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_intel

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ _amd, 16
        PROLOGUE_3_ARGS
        %1      T0, A1
%if %4 != 0
        jz      .unchanged_dst
%endif
        mov     [A0], T0
.unchanged_dst:
        IEM_SAVE_AND_ADJUST_FLAGS       A2, %2, 0, 0    ; Only the ZF flag is modified on AMD Zen 2.
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64_amd

 %endif ; RT_ARCH_AMD64
%endmacro

; BSF/BSR leave the destination untouched on zero input (%4=1);
; TZCNT/LZCNT always write the destination (%4=0).
IEMIMPL_BIT_OP2 bsf,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 bsr,   (X86_EFL_ZF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1
IEMIMPL_BIT_OP2 tzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_BIT_OP2 lzcnt, (X86_EFL_ZF | X86_EFL_CF), (X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1075
1076
1077;;
1078; Macro for implementing POPCNT.
1079;
; This will generate code for the 16, 32 and 64 bit accesses, except on
; 32-bit systems, where the 64-bit accesses require hand coding.
1082;
1083; All the functions takes a pointer to the destination memory operand in A0,
1084; the source register operand in A1 and a pointer to eflags in A2.
1085;
1086; ASSUMES Intel and AMD set EFLAGS the same way.
1087;
1088; ASSUMES the instruction does not support memory destination.
1089;
1090; @param 1 The instruction mnemonic.
1091; @param 2 The modified flags.
1092; @param 3 The undefined flags.
1093;
%macro IEMIMPL_BIT_OP3 3
BEGINCODE
;
; 16-bit worker: applies %1 to the source register passed in A1, stores the
; result to the memory operand pointed to by A0, and folds the resulting
; host flags (modified mask %2, undefined mask %3) into the eflags variable
; pointed to by A2.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
        %1      T0_16, A1_16
        mov     [A0], T0_16
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

; 32-bit worker, same layout as the 16-bit one.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
        %1      T0_32, A1_32
        mov     [A0], T0_32
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
; 64-bit worker; only generated on 64-bit hosts (32-bit hosts hand-code it).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
        %1      T0, A1
        mov     [A0], T0
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64
%endmacro

; POPCNT: all six status flags are in the modified mask, none undefined
; (per the SDM, popcnt clears OF/SF/AF/CF/PF and sets ZF from the result).
IEMIMPL_BIT_OP3 popcnt, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_OF | X86_EFL_SF | X86_EFL_AF | X86_EFL_PF), 0
1126
1127
1128;
1129; IMUL is also a similar but yet different case (no lock, no mem dst).
1130; The rDX:rAX variant of imul is handled together with mul further down.
1131;
1132BEGINCODE
1133; @param 1 EFLAGS that are modified.
1134; @param 2 Undefined EFLAGS.
1135; @param 3 Function suffix.
1136; @param 4 EFLAGS variation: 0 for native, 1 for intel (ignored),
1137; 2 for AMD (set AF, clear PF, ZF and SF).
; Two-operand IMUL worker: multiplies the destination memory operand at [A0]
; by the register value in A1, stores the product back to [A0], and updates
; the eflags variable pointed to by A2.
;
; When %4 == 1 (the intel variant) the helper macro recomputes SF/PF from the
; result and clears AF/ZF instead of taking the raw host flags; all other
; values of %4 save the host flags directly.
%macro IEMIMPL_IMUL_TWO 4
BEGINPROC_FASTCALL iemAImpl_imul_two_u16 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1_16, word [A0]        ; A1 *= *pu16Dst
        mov     [A0], A1_16             ; *pu16Dst = product
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_16, 16, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u16 %+ %3

BEGINPROC_FASTCALL iemAImpl_imul_two_u32 %+ %3, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1_32, dword [A0]
        mov     [A0], A1_32
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1_32, 32, A1
 %endif
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_imul_two_u32 %+ %3

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_imul_two_u64 %+ %3, 16
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %1, %2
        imul    A1, qword [A0]
        mov     [A0], A1
 %if %4 != 1
        IEM_SAVE_FLAGS                  A2, %1, %2
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %1, X86_EFL_AF | X86_EFL_ZF, A1, 64, A1
 %endif
        EPILOGUE_3_ARGS_EX 8
ENDPROC iemAImpl_imul_two_u64 %+ %3
 %endif ; RT_ARCH_AMD64
%endmacro

; Native, Intel and AMD eflags variants (see the variation comment above).
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF, , 0
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _intel, 1
IEMIMPL_IMUL_TWO X86_EFL_OF | X86_EFL_CF, 0, _amd, 2
1183
1184
1185;
1186; XCHG for memory operands. This implies locking. No flag changes.
1187;
1188; Each function takes two arguments, first the pointer to the memory,
1189; then the pointer to the register. They all return void.
1190;
1191BEGINCODE
;
; All sizes follow the same pattern: load the register value via the pointer
; in A1, atomically exchange it with the memory operand at [A0] (xchg with a
; memory operand asserts LOCK implicitly, so no prefix is needed), then write
; the previous memory value back through A1.  No flags are changed.
;
BEGINPROC_FASTCALL iemAImpl_xchg_u8_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        xchg    [A0], T0_8              ; atomic swap with *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        xchg    [A0], T0_16
        mov     [A1], T0_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_locked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_locked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        xchg    [A0], T0_32
        mov     [A1], T0_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_locked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        xchg    [A0], T0
        mov     [A1], T0
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_locked
%endif
1225
1226; Unlocked variants for fDisregardLock mode.
1227
;
; Unlocked variants (for the fDisregardLock mode): deliberately NON-atomic.
; Both values are loaded into scratch registers first and then stored
; cross-wise, avoiding xchg's implicit bus lock.  No flags are changed.
;
BEGINPROC_FASTCALL iemAImpl_xchg_u8_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        mov     T1_8, [A0]              ; T1 = *pu8Mem
        mov     [A0], T0_8              ; *pu8Mem = old register value
        mov     [A1], T1_8              ; *pu8Reg = old memory value
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u8_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u16_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_16, [A1]
        mov     T1_16, [A0]
        mov     [A0], T0_16
        mov     [A1], T1_16
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u16_unlocked

BEGINPROC_FASTCALL iemAImpl_xchg_u32_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0_32, [A1]
        mov     T1_32, [A0]
        mov     [A0], T0_32
        mov     [A1], T1_32
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u32_unlocked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xchg_u64_unlocked, 8
        PROLOGUE_2_ARGS
        mov     T0, [A1]
        mov     T1, [A0]
        mov     [A0], T0
        mov     [A1], T1
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_xchg_u64_unlocked
%endif
1265
1266
1267;
1268; XADD for memory operands.
1269;
1270; Each function takes three arguments, first the pointer to the
1271; memory/register, then the pointer to the register, and finally a pointer to
1272; eflags. They all return void.
1273;
1274BEGINCODE
;
; Unlocked XADD workers: T0 = *pReg; then xadd performs *pMem += T0 while
; returning the old *pMem in T0, which is written back through A1.  All six
; arithmetic status flags come from the addition and are merged into the
; eflags variable pointed to by A2.
;
BEGINPROC_FASTCALL iemAImpl_xadd_u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        xadd    [A0], T0_8              ; *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old memory value
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8

BEGINPROC_FASTCALL iemAImpl_xadd_u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        xadd    [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16

BEGINPROC_FASTCALL iemAImpl_xadd_u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        xadd    [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        xadd    [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64
%endif ; RT_ARCH_AMD64
1316
;
; Locked XADD workers: identical to the unlocked ones above except for the
; explicit LOCK prefix making the read-modify-write of [A0] atomic.
;
BEGINPROC_FASTCALL iemAImpl_xadd_u8_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_8, [A1]              ; T0 = *pu8Reg
        lock xadd [A0], T0_8            ; atomic: *pu8Mem += T0; T0 = old *pu8Mem
        mov     [A1], T0_8              ; *pu8Reg = old memory value
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u8_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u16_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_16, [A1]
        lock xadd [A0], T0_16
        mov     [A1], T0_16
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u16_locked

BEGINPROC_FASTCALL iemAImpl_xadd_u32_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0_32, [A1]
        lock xadd [A0], T0_32
        mov     [A1], T0_32
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u32_locked

%ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_xadd_u64_locked, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        mov     T0, [A1]
        lock xadd [A0], T0
        mov     [A1], T0
        IEM_SAVE_FLAGS                  A2, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_xadd_u64_locked
%endif ; RT_ARCH_AMD64
1358
1359
1360;
1361; CMPXCHG8B.
1362;
1363; These are tricky register wise, so the code is duplicated for each calling
1364; convention.
1365;
1366; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1367;
1368; C-proto:
1369; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg8b,(uint64_t *pu64Dst, PRTUINT64U pu64EaxEdx, PRTUINT64U pu64EbxEcx,
1370; uint32_t *pEFlags));
1371;
1372; Note! Identical to iemAImpl_cmpxchg16b.
1373;
1374BEGINCODE
; cmpxchg8b needs eax:edx (comparand), ebx:ecx (replacement) and the memory
; operand, so the incoming argument registers must be shuffled out of the
; way per calling convention.  rbx/ebx is callee-saved everywhere and is
; therefore pushed/popped around the operation.  Only ZF is merged back
; (guest cmpxchg8b leaves the other flags unaffected).
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b, 16
%ifdef RT_ARCH_AMD64
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, but cmpxchg8b needs ebx

        mov     r11, rdx                ; pu64EaxEdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     ebx, [r8]               ; ebx:ecx = replacement (*pu64EbxEcx)
        mov     ecx, [r8 + 4]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [r11]              ; edx:eax = comparand (*pu64EaxEdx)
        mov     edx, [r11 + 4]

        lock cmpxchg8b [r10]

        mov     [r11], eax              ; write back edx:eax (actual value on mismatch)
        mov     [r11 + 4], edx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, but cmpxchg8b needs ebx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64EbxEcx (is also T1)

        mov     ebx, [r11]              ; ebx:ecx = replacement
        mov     ecx, [r11 + 4]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [rsi]              ; edx:eax = comparand
        mov     edx, [rsi + 4]

        lock cmpxchg8b [rdi]

        mov     [rsi], eax
        mov     [rsi + 4], edx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
%else
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64EaxEdx
        mov     ecx, [esp + 16 + 4 + 0] ; pu64EbxEcx (16 = saved regs, 4 = return address)
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ebx:ecx = replacement
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = comparand
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        mov     [esi], eax
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8                       ; pop the two stack arguments
%endif
ENDPROC iemAImpl_cmpxchg8b
1449
; The "unlocked" worker above already uses a LOCK prefix, so the locked
; variant is a simple tail-jump to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg8b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg8b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg8b,16,$@)
ENDPROC iemAImpl_cmpxchg8b_locked
1454
1455%ifdef RT_ARCH_AMD64
1456
1457;
1458; CMPXCHG16B.
1459;
1460; These are tricky register wise, so the code is duplicated for each calling
1461; convention.
1462;
1463; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1464;
1465; C-proto:
1466; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg16b,(PRTUINT128U pu128Dst, PRTUINT128U pu1284RaxRdx, PRTUINT128U pu128RbxRcx,
1467; uint32_t *pEFlags));
1468;
1469; Note! Identical to iemAImpl_cmpxchg8b.
1470;
1471BEGINCODE
; 128-bit analogue of iemAImpl_cmpxchg8b: rdx:rax = comparand, rcx:rbx =
; replacement.  Only ZF is merged back into *pEFlags.
; NOTE(review): cmpxchg16b requires a 16-byte aligned memory operand per the
; SDM; assumed to be ensured by the caller.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b, 16
 %ifdef ASM_CALL64_MSC
        push    rbx                     ; callee-saved, but cmpxchg16b needs rbx

        mov     r11, rdx                ; pu64RaxRdx (is also T1)
        mov     r10, rcx                ; pu64Dst

        mov     rbx, [r8]               ; rcx:rbx = replacement (*pu128RbxRcx)
        mov     rcx, [r8 + 8]
        IEM_MAYBE_LOAD_FLAGS r9, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [r11]              ; rdx:rax = comparand (*pu128RaxRdx)
        mov     rdx, [r11 + 8]

        lock cmpxchg16b [r10]

        mov     [r11], rax              ; write back rdx:rax (actual value on mismatch)
        mov     [r11 + 8], rdx
        IEM_SAVE_FLAGS       r9, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret
 %else
        push    rbx                     ; callee-saved, but cmpxchg16b needs rbx

        mov     r10, rcx                ; pEFlags
        mov     r11, rdx                ; pu64RbxRcx (is also T1)

        mov     rbx, [r11]              ; rcx:rbx = replacement
        mov     rcx, [r11 + 8]
        IEM_MAYBE_LOAD_FLAGS r10, (X86_EFL_ZF), 0 ; clobbers T0 (eax)
        mov     rax, [rsi]              ; rdx:rax = comparand
        mov     rdx, [rsi + 8]

        lock cmpxchg16b [rdi]

        mov     [rsi], rax
        mov     [rsi + 8], rdx
        IEM_SAVE_FLAGS       r10, (X86_EFL_ZF), 0 ; clobbers T0+T1 (eax, r11)

        pop     rbx
        ret

 %endif
ENDPROC iemAImpl_cmpxchg16b

; The worker above already uses a LOCK prefix, so just tail-jump to it.
BEGINPROC_FASTCALL iemAImpl_cmpxchg16b_locked, 16
        ; Lazy bird always lock prefixes cmpxchg16b.
        jmp     NAME_FASTCALL(iemAImpl_cmpxchg16b,16,$@)
ENDPROC iemAImpl_cmpxchg16b_locked
1521
1522%endif ; RT_ARCH_AMD64
1523
1524
1525;
1526; CMPXCHG.
1527;
1528; WARNING! This code make ASSUMPTIONS about which registers T1 and T0 are mapped to!
1529;
1530; C-proto:
1531; IEM_DECL_IMPL_DEF(void, iemAImpl_cmpxchg,(uintX_t *puXDst, uintX_t puEax, uintX_t uReg, uint32_t *pEFlags));
1532;
1533BEGINCODE
; CMPXCHG worker macro.
;
; @param 1     Lock prefix (empty or 'lock').
; @param 2     Function name suffix (empty or '_locked').
;
; Each worker loads the accumulator image from [A1] into al/ax/eax/rax,
; performs cmpxchg against [A0] with the register operand in A2, writes the
; (possibly updated) accumulator back through A1 and merges all six status
; flags into the eflags variable pointed to by A3 — guest CMPXCHG sets
; CF/PF/AF/SF/OF from the implicit compare, not just ZF.
%macro IEMIMPL_CMPXCHG 2
BEGINPROC_FASTCALL iemAImpl_cmpxchg_u8 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     al, [A1]                ; al = comparand (guest al image)
        %1      cmpxchg [A0], A2_8
        mov     [A1], al                ; write back the accumulator
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u8 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u16 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     ax, [A1]
        %1      cmpxchg [A0], A2_16
        mov     [A1], ax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u16 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u32 %+ %2, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [A1]
        %1      cmpxchg [A0], A2_32
        mov     [A1], eax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cmpxchg_u32 %+ %2

BEGINPROC_FASTCALL iemAImpl_cmpxchg_u64 %+ %2, 16
%ifdef RT_ARCH_AMD64
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     rax, [A1]
        %1      cmpxchg [A0], A2
        mov     [A1], rax
        IEM_SAVE_FLAGS       A3, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, r11/edi)
        EPILOGUE_4_ARGS
%else
        ;
        ; Must use cmpxchg8b here. See also iemAImpl_cmpxchg8b.
        ;
        push    esi
        push    edi
        push    ebx
        push    ebp

        mov     edi, ecx                ; pu64Dst
        mov     esi, edx                ; pu64Rax
        mov     ecx, [esp + 16 + 4 + 0] ; pu64Reg - Note! Pointer on 32-bit hosts!
        mov     ebp, [esp + 16 + 4 + 4] ; pEFlags

        mov     ebx, [ecx]              ; ebx:ecx = replacement (*pu64Reg)
        mov     ecx, [ecx + 4]
        IEM_MAYBE_LOAD_FLAGS ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0 (eax)
        mov     eax, [esi]              ; edx:eax = comparand (*pu64Rax)
        mov     edx, [esi + 4]

        lock cmpxchg8b [edi]

        ; cmpxchg8b doesn't set CF, PF, AF, SF and OF, so we have to do that.
        ; On a failed compare (ZF clear) the real flags of 'cmp rAX, dest'
        ; must be synthesized; on success a self-compare gives the correct
        ; equal-result flags.  (Fixed: was 'jz', which branched the success
        ; case into the mismatch path and left ZF=1 for failed compares.)
        jnz     .cmpxchg8b_not_equal
        cmp     eax, eax                ; just set the other flags.
.store:
        mov     [esi], eax              ; *pu64Rax = edx:eax (actual value on mismatch)
        mov     [esi + 4], edx
        IEM_SAVE_FLAGS       ebp, (X86_EFL_ZF | X86_EFL_CF | X86_EFL_PF | X86_EFL_AF | X86_EFL_SF | X86_EFL_OF), 0 ; clobbers T0+T1 (eax, edi)

        pop     ebp
        pop     ebx
        pop     edi
        pop     esi
        ret     8

.cmpxchg8b_not_equal:
        cmp     [esi + 4], edx          ;; @todo FIXME - verify 64-bit compare implementation
        jne     .store                  ; high halves differ: flags from high-half cmp
        cmp     [esi], eax              ; high halves equal: flags from low-half cmp
        jmp     .store

%endif
ENDPROC iemAImpl_cmpxchg_u64 %+ %2
%endmacro ; IEMIMPL_CMPXCHG

IEMIMPL_CMPXCHG , ,
IEMIMPL_CMPXCHG lock, _locked
1622
1623;;
1624; Macro for implementing a unary operator.
1625;
1626; This will generate code for the 8, 16, 32 and 64 bit accesses with locked
1627; variants, except on 32-bit system where the 64-bit accesses requires hand
1628; coding.
1629;
1630; All the functions takes a pointer to the destination memory operand in A0,
1631; the source register operand in A1 and a pointer to eflags in A2.
1632;
1633; @param 1 The instruction mnemonic.
1634; @param 2 The modified flags.
1635; @param 3 The undefined flags.
1636;
; Unary read-modify-write worker: applies %1 directly to the memory operand
; at [A0] and merges the host flags (modified mask %2, undefined mask %3)
; into the eflags variable pointed to by A1.  A locked variant is generated
; for each size.
%macro IEMIMPL_UNARY_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        %1      byte [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        lock %1 byte [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        %1      word [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        lock %1 word [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16_locked

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        %1      dword [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        lock %1 dword [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32_locked

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        %1      qword [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64_locked, 8
        PROLOGUE_2_ARGS
        IEM_MAYBE_LOAD_FLAGS            A1, %2, %3
        lock %1 qword [A0]
        IEM_SAVE_FLAGS                  A1, %2, %3
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64_locked
 %endif ; RT_ARCH_AMD64

%endmacro

; Note: inc/dec leave CF untouched, hence CF is absent from their modified
; mask; not changes no flags at all.
IEMIMPL_UNARY_OP inc, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP dec, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), 0
IEMIMPL_UNARY_OP neg, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0
IEMIMPL_UNARY_OP not, 0, 0
1711
1712
1713;
1714; BSWAP. No flag changes.
1715;
1716; Each function takes one argument, pointer to the value to bswap
1717; (input/output). They all return void.
1718;
; 16-bit bswap: the 'db 66h' forces an operand-size prefix onto the 32-bit
; bswap, producing the encoding for the 16-bit form whose result the SDM
; leaves undefined — we simply reproduce whatever the host CPU does.
BEGINPROC_FASTCALL iemAImpl_bswap_u16, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]             ; just in case any of the upper bits are used.
        db 66h
        bswap   T0_32                   ; 66h + bswap r32 = (undefined) bswap r16
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u16

BEGINPROC_FASTCALL iemAImpl_bswap_u32, 4
        PROLOGUE_1_ARGS
        mov     T0_32, [A0]
        bswap   T0_32
        mov     [A0], T0_32
        EPILOGUE_1_ARGS
ENDPROC iemAImpl_bswap_u32

BEGINPROC_FASTCALL iemAImpl_bswap_u64, 4
%ifdef RT_ARCH_AMD64
        PROLOGUE_1_ARGS
        mov     T0, [A0]
        bswap   T0
        mov     [A0], T0
        EPILOGUE_1_ARGS
%else
        ; 32-bit host: byte-swap each half and store them cross-wise,
        ; which together performs the full 64-bit byte reversal.
        PROLOGUE_1_ARGS
        mov     T0, [A0]                ; T0 = low dword
        mov     T1, [A0 + 4]            ; T1 = high dword
        bswap   T0
        bswap   T1
        mov     [A0 + 4], T0            ; swapped low half becomes the high half
        mov     [A0], T1                ; and vice versa
        EPILOGUE_1_ARGS
%endif
ENDPROC iemAImpl_bswap_u64
1754
1755
1756;;
1757; Macro for implementing a shift operation.
1758;
1759; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1760; 32-bit system where the 64-bit accesses requires hand coding.
1761;
1762; All the functions takes a pointer to the destination memory operand in A0,
1763; the shift count in A1 and a pointer to eflags in A2.
1764;
1765; @param 1 The instruction mnemonic.
1766; @param 2 The modified flags.
1767; @param 3 The undefined flags.
1768;
1769; Makes ASSUMPTIONS about A0, A1 and A2 assignments.
1770;
1771; @note the _intel and _amd variants are implemented in C.
1772;
; Shift/rotate worker: the count must be in cl.  With the GCC 64-bit
; convention rcx is free, so the count (A1) is simply copied into cl.  In
; the other conventions A0 arrives in rcx/ecx, so A0 and A1 are swapped to
; free cl for the count while the memory operand is addressed via A1.
%macro IEMIMPL_SHIFT_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8                ; cl = shift count
        %1      byte [A0], cl
 %else
        xchg    A1, A0                  ; count -> cl, pointer -> A1
        %1      byte [A1], cl
 %endif
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      word [A0], cl
 %else
        xchg    A1, A0
        %1      word [A1], cl
 %endif
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      dword [A0], cl
 %else
        xchg    A1, A0
        %1      dword [A1], cl
 %endif
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     cl, A1_8
        %1      qword [A0], cl
 %else
        xchg    A1, A0
        %1      qword [A1], cl
 %endif
        IEM_SAVE_FLAGS                  A2, %2, %3
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_OP rol, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP ror, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcl, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP rcr, (X86_EFL_OF | X86_EFL_CF), 0
IEMIMPL_SHIFT_OP shl, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP shr, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_OP sar, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1842
1843
1844;;
1845; Macro for implementing a double precision shift operation.
1846;
1847; This will generate code for the 16, 32 and 64 bit accesses, except on
1848; 32-bit system where the 64-bit accesses requires hand coding.
1849;
1850; The functions takes the destination operand (r/m) in A0, the source (reg) in
1851; A1, the shift count in A2 and a pointer to the eflags variable/register in A3.
1852;
1853; @param 1 The instruction mnemonic.
1854; @param 2 The modified flags.
1855; @param 3 The undefined flags.
1856;
1857; Makes ASSUMPTIONS about A0, A1, A2 and A3 assignments.
1858;
1859; @note the _intel and _amd variants are implemented in C.
1860;
; Double-precision shift worker (shld/shrd): the count must be in cl.  With
; the GCC convention the count (A2) and the eflags pointer (A3, rcx) are
; swapped around the operation and swapped back so IEM_SAVE_FLAGS still sees
; A3; in the other conventions A0 (rcx/ecx) and A2 are swapped instead and
; left swapped, as neither is needed afterwards.
%macro IEMIMPL_SHIFT_DBL_OP 3
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2                  ; count -> cl
        %1      [A0], A1_16, cl
        xchg    A3, A2                  ; restore the eflags pointer
 %else
        xchg    A0, A2                  ; count -> cl, pointer -> A2
        %1      [A2], A1_16, cl
 %endif
        IEM_SAVE_FLAGS                  A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1_32, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1_32, cl
 %endif
        IEM_SAVE_FLAGS                  A3, %2, %3
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32

 %ifdef RT_ARCH_AMD64
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
 %ifdef ASM_CALL64_GCC
        xchg    A3, A2
        %1      [A0], A1, cl
        xchg    A3, A2
 %else
        xchg    A0, A2
        %1      [A2], A1, cl
 %endif
        IEM_SAVE_FLAGS                  A3, %2, %3
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif ; RT_ARCH_AMD64

%endmacro

IEMIMPL_SHIFT_DBL_OP shld, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
IEMIMPL_SHIFT_DBL_OP shrd, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_PF | X86_EFL_CF), (X86_EFL_AF)
1914
1915
1916;;
1917; Macro for implementing a multiplication operations.
1918;
1919; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
1920; 32-bit system where the 64-bit accesses requires hand coding.
1921;
1922; The 8-bit function only operates on AX, so it takes no DX pointer. The other
1923; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
1924; pointer to eflags in A3.
1925;
1926; The functions all return 0 so the caller can be used for div/idiv as well as
1927; for the mul/imul implementation.
1928;
1929; @param 1 The instruction mnemonic.
1930; @param 2 The modified flags.
1931; @param 3 The undefined flags.
1932; @param 4 Name suffix.
1933; @param 5 EFLAGS behaviour: 0 for native, 1 for intel and 2 for AMD.
1934;
1935; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
1936;
; MUL/IMUL worker: the hardware multiply uses rAX (and rDX for the high
; half), so the guest register images are loaded/stored via the pointers in
; A0/A1.  Each worker returns 0 in eax so the caller code can be shared with
; the div/idiv workers (which return -1 on #DE conditions).
%macro IEMIMPL_MUL_OP 5
BEGINCODE
; 8-bit form: the 16-bit product lands entirely in ax, hence no DX pointer.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %4, 12
        PROLOGUE_3_ARGS
        IEM_MAYBE_LOAD_FLAGS            A2, %2, %3
        mov     al, [A0]                ; al = *pu16AX (low byte)
        %1      A1_8                    ; ax = al * operand
        mov     [A0], ax                ; store the full 16-bit product
 %if %5 != 1
        IEM_SAVE_FLAGS                  A2, %2, %3
 %else
        ; Intel variant: recompute SF/PF from the result, clear AF/ZF.
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A2, %2, X86_EFL_AF | X86_EFL_ZF, ax, 8, xAX
 %endif
        xor     eax, eax                ; return 0 (shared caller logic with div)
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
        mov     ax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; A1 is rdx/edx, which %1 clobbers - stash it
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                  A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, ax, 16, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %4

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %4, 16
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
        mov     eax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; A1 is rdx/edx, which %1 clobbers - stash it
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                  A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, eax, 32, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %4

 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %4, 20
        PROLOGUE_4_ARGS
        IEM_MAYBE_LOAD_FLAGS            A3, %2, %3
        mov     rax, [A0]
 %ifdef ASM_CALL64_GCC
        %1      A2
        mov     [A0], rax
        mov     [A1], rdx
 %else
        mov     T1, A1                  ; A1 is rdx, which %1 clobbers - stash it
        %1      A2
        mov     [A0], rax
        mov     [T1], rdx
 %endif
 %if %5 != 1
        IEM_SAVE_FLAGS                  A3, %2, %3
 %else
        IEM_SAVE_FLAGS_ADJUST_AND_CALC_SF_PF A3, %2, X86_EFL_AF | X86_EFL_ZF, rax, 64, xAX
 %endif
        xor     eax, eax
        EPILOGUE_4_ARGS_EX 12
ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %4
 %endif ; !RT_ARCH_AMD64

%endmacro

IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP mul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), (X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF), , 0
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _intel, 1
IEMIMPL_MUL_OP imul, (X86_EFL_OF | X86_EFL_CF), 0, _amd, 2
2033
2034
2035BEGINCODE
;;
; Worker function for negating the double-width value held in the T1:T0
; register pair (32-bit registers here, i.e. a 64-bit value).  Computes
; T1:T0 = 0 - T1:T0 using two stack slots so no other register is touched.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u32
        push    0                       ; two zero slots: the minuend 0:0
        push    0
        xchg    T0_32, [xSP]            ; stash old T0 on the stack, T0 = 0
        xchg    T1_32, [xSP + xCB]      ; stash old T1, T1 = 0
        sub     T0_32, [xSP]            ; T0 = 0 - old T0
        sbb     T1_32, [xSP + xCB]      ; T1 = 0 - old T1 - borrow
        add     xSP, xCB*2              ; drop the temporaries
        ret
ENDPROC iemAImpl_negate_T0_T1_u32

%ifdef RT_ARCH_AMD64
;;
; Worker function for negating the double-width value held in the T1:T0
; register pair (64-bit registers here); same technique as the 32-bit
; variant above.
; @uses None (T0,T1)
BEGINPROC iemAImpl_negate_T0_T1_u64
        push    0
        push    0
        xchg    T0, [xSP]
        xchg    T1, [xSP + xCB]
        sub     T0, [xSP]
        sbb     T1, [xSP + xCB]
        add     xSP, xCB*2
        ret
ENDPROC iemAImpl_negate_T0_T1_u64
%endif
2065
2066
2067;;
2068; Macro for implementing a division operations.
2069;
2070; This will generate code for the 8, 16, 32 and 64 bit accesses, except on
2071; 32-bit system where the 64-bit accesses requires hand coding.
2072;
2073; The 8-bit function only operates on AX, so it takes no DX pointer. The other
2074; functions takes a pointer to rAX in A0, rDX in A1, the operand in A2 and a
2075; pointer to eflags in A3.
2076;
2077; The functions all return 0 on success and -1 if a divide error should be
2078; raised by the caller.
2079;
2080; @param 1 The instruction mnemonic.
2081; @param 2 The modified flags.
2082; @param 3 The undefined flags.
2083; @param 4 1 if signed, 0 if unsigned.
2084; @param 5 Function suffix.
2085; @param 6 EFLAGS variation: 0 for native, 1 for intel (ignored),
2086; 2 for AMD (set AF, clear PF, ZF and SF).
2087;
2088; Makes ASSUMPTIONS about A0, A1, A2, A3, T0 and T1 assignments.
2089;
2090%macro IEMIMPL_DIV_OP 6
2091BEGINCODE
2092BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u8 %+ %5, 12
2093 PROLOGUE_3_ARGS
2094
2095 ; div by chainsaw check.
2096 test A1_8, A1_8
2097 jz .div_zero
2098
2099 ; Overflow check - unsigned division is simple to verify, haven't
2100 ; found a simple way to check signed division yet unfortunately.
2101 %if %4 == 0
2102 cmp [A0 + 1], A1_8
2103 jae .div_overflow
2104 %else
2105 mov T0_16, [A0] ; T0 = dividend
2106 mov T1, A1 ; T1 = saved divisor (because of missing T1_8 in 32-bit)
2107 test A1_8, A1_8
2108 js .divisor_negative
2109 test T0_16, T0_16
2110 jns .both_positive
2111 neg T0_16
2112.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2113 push T0 ; Start off like unsigned below.
2114 shr T0_16, 7
2115 cmp T0_8, A1_8
2116 pop T0
2117 jb .div_no_overflow
2118 ja .div_overflow
2119 and T0_8, 0x7f ; Special case for covering (divisor - 1).
2120 cmp T0_8, A1_8
2121 jae .div_overflow
2122 jmp .div_no_overflow
2123
2124.divisor_negative:
2125 neg A1_8
2126 test T0_16, T0_16
2127 jns .one_of_each
2128 neg T0_16
2129.both_positive: ; Same as unsigned shifted by sign indicator bit.
2130 shr T0_16, 7
2131 cmp T0_8, A1_8
2132 jae .div_overflow
2133.div_no_overflow:
2134 mov A1, T1 ; restore divisor
2135 %endif
2136
2137 IEM_MAYBE_LOAD_FLAGS A2, %2, %3
2138 mov ax, [A0]
2139 %1 A1_8
2140 mov [A0], ax
2141 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2142 IEM_ADJUST_FLAGS A2, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2143 %else
2144 IEM_SAVE_FLAGS A2, %2, %3
2145 %endif
2146 xor eax, eax
2147
2148.return:
2149 EPILOGUE_3_ARGS
2150
2151.div_zero:
2152.div_overflow:
2153 mov eax, -1
2154 jmp .return
2155ENDPROC iemAImpl_ %+ %1 %+ _u8 %+ %5
2156
;
; The 16-bit worker: A0 = pointer to AX (low word, in/out), A1 = pointer to
; DX (high word, in/out), A2 = the 16-bit divisor, A3 = pointer to the EFLAGS.
; Returns 0 in eax on success, -1 (raise #DE) on division by zero or overflow.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u16 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_16, A2_16
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_16             ; Unsigned: quotient overflows iff high word >= divisor.
        jae     .div_overflow
 %else
        mov     T0_16, [A1]             ; Assemble the 32-bit dividend in T0: high word ...
        shl     T0_32, 16
        mov     T0_16, [A0]             ; T0 = dividend (... plus low word).
        mov     T1, A2                  ; T1 = divisor
        test    T1_16, T1_16
        js      .divisor_negative
        test    T0_32, T0_32
        jns     .both_positive
        neg     T0_32
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_16, 0x7fff           ; Special case for covering (divisor - 1).
        cmp     T0_16, T1_16
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     T1_16
        test    T0_32, T0_32
        jns     .one_of_each
        neg     T0_32
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shr     T0_32, 15
        cmp     T0_16, T1_16
        jae     .div_overflow
.div_no_overflow:
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Free up dx (== A2) for the instruction.
        mov     ax, [A0]
        mov     dx, [A1]
        %1      T1_16
        mov     [A0], ax
        mov     [A1], dx
 %else
        mov     T1, A1                  ; Free up dx (== A1) for the instruction.
        mov     ax, [A0]
        mov     dx, [T1]
        %1      A2_16
        mov     [A0], ax
        mov     [T1], dx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_zero:
.div_overflow:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u16 %+ %5
2234
;
; The 32-bit worker: A0 = pointer to EAX (low dword, in/out), A1 = pointer to
; EDX (high dword, in/out), A2 = the 32-bit divisor, A3 = pointer to the EFLAGS.
; Returns 0 in eax on success, -1 (raise #DE) on division by zero or overflow.
;
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32 %+ %5, 16
        PROLOGUE_4_ARGS

        ; div by chainsaw check.
        test    A2_32, A2_32
        jz      .div_zero

        ; Overflow check - unsigned division is simple to verify, haven't
        ; found a simple way to check signed division yet unfortunately.
 %if %4 == 0
        cmp     [A1], A2_32             ; Unsigned: quotient overflows iff high dword >= divisor.
        jae     .div_overflow
 %else
        push    A2                      ; save A2 so we can modify it (we're out of regs on x86).
        mov     T0_32, [A0]             ; T0 = dividend low
        mov     T1_32, [A1]             ; T1 = dividend high
        test    A2_32, A2_32
        js      .divisor_negative
        test    T1_32, T1_32
        jns     .both_positive
        call    NAME(iemAImpl_negate_T0_T1_u32)
.one_of_each:                           ; OK range is 2^(result-width - 1) + (divisor - 1).
        push    T0                      ; Start off like unsigned below.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32            ; T1 = abs(dividend) >> 31
        cmp     T1_32, A2_32
        pop     T0
        jb      .div_no_overflow
        ja      .div_overflow
        and     T0_32, 0x7fffffff       ; Special case for covering (divisor - 1).
        cmp     T0_32, A2_32
        jae     .div_overflow
        jmp     .div_no_overflow

.divisor_negative:
        neg     A2_32
        test    T1_32, T1_32
        jns     .one_of_each
        call    NAME(iemAImpl_negate_T0_T1_u32)
.both_positive:                         ; Same as unsigned shifted by sign indicator bit.
        shl     T1_32, 1
        shr     T0_32, 31
        or      T1_32, T0_32
        cmp     T1_32, A2_32
        jae     .div_overflow
.div_no_overflow:
        pop     A2                      ; restore the divisor.
 %endif

        IEM_MAYBE_LOAD_FLAGS A3, %2, %3
 %ifdef ASM_CALL64_GCC
        mov     T1, A2                  ; Free up edx (== A2) for the instruction.
        mov     eax, [A0]
        mov     edx, [A1]
        %1      T1_32
        mov     [A0], eax
        mov     [A1], edx
 %else
        mov     T1, A1                  ; Free up edx (== A1) for the instruction.
        mov     eax, [A0]
        mov     edx, [T1]
        %1      A2_32
        mov     [A0], eax
        mov     [T1], edx
 %endif
 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
        IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
 %else
        IEM_SAVE_FLAGS A3, %2, %3
 %endif
        xor     eax, eax                ; Success.

.return:
        EPILOGUE_4_ARGS

.div_overflow:
 %if %4 != 0
        pop     A2                      ; Don't leak the saved divisor on the overflow path.
 %endif
.div_zero:
        mov     eax, -1                 ; Tell the caller to raise #DE.
        jmp     .return
ENDPROC iemAImpl_ %+ %1 %+ _u32 %+ %5
2320
2321 %ifdef RT_ARCH_AMD64 ; The 32-bit host version lives in IEMAllAImplC.cpp.
2322BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64 %+ %5, 20
2323 PROLOGUE_4_ARGS
2324
2325 test A2, A2
2326 jz .div_zero
2327 %if %4 == 0
2328 cmp [A1], A2
2329 jae .div_overflow
2330 %else
2331 push A2 ; save A2 so we modify it (we out of regs on x86).
2332 mov T0, [A0] ; T0 = dividend low
2333 mov T1, [A1] ; T1 = dividend high
2334 test A2, A2
2335 js .divisor_negative
2336 test T1, T1
2337 jns .both_positive
2338 call NAME(iemAImpl_negate_T0_T1_u64)
2339.one_of_each: ; OK range is 2^(result-with - 1) + (divisor - 1).
2340 push T0 ; Start off like unsigned below.
2341 shl T1, 1
2342 shr T0, 63
2343 or T1, T0
2344 cmp T1, A2
2345 pop T0
2346 jb .div_no_overflow
2347 ja .div_overflow
2348 mov T1, 0x7fffffffffffffff
2349 and T0, T1 ; Special case for covering (divisor - 1).
2350 cmp T0, A2
2351 jae .div_overflow
2352 jmp .div_no_overflow
2353
2354.divisor_negative:
2355 neg A2
2356 test T1, T1
2357 jns .one_of_each
2358 call NAME(iemAImpl_negate_T0_T1_u64)
2359.both_positive: ; Same as unsigned shifted by sign indicator bit.
2360 shl T1, 1
2361 shr T0, 63
2362 or T1, T0
2363 cmp T1, A2
2364 jae .div_overflow
2365.div_no_overflow:
2366 pop A2
2367 %endif
2368
2369 IEM_MAYBE_LOAD_FLAGS A3, %2, %3
2370 mov rax, [A0]
2371 %ifdef ASM_CALL64_GCC
2372 mov T1, A2
2373 mov rax, [A0]
2374 mov rdx, [A1]
2375 %1 T1
2376 mov [A0], rax
2377 mov [A1], rdx
2378 %else
2379 mov T1, A1
2380 mov rax, [A0]
2381 mov rdx, [T1]
2382 %1 A2
2383 mov [A0], rax
2384 mov [T1], rdx
2385 %endif
2386 %if %6 == 2 ; AMD64 3990X: Set AF and clear PF, ZF and SF.
2387 IEM_ADJUST_FLAGS A3, X86_EFL_PF | X86_EFL_ZF | X86_EFL_SF, X86_EFL_AF
2388 %else
2389 IEM_SAVE_FLAGS A3, %2, %3
2390 %endif
2391 xor eax, eax
2392
2393.return:
2394 EPILOGUE_4_ARGS_EX 12
2395
2396.div_overflow:
2397 %if %4 != 0
2398 pop A2
2399 %endif
2400.div_zero:
2401 mov eax, -1
2402 jmp .return
2403ENDPROC iemAImpl_ %+ %1 %+ _u64 %+ %5
2404 %endif ; !RT_ARCH_AMD64
2405
2406%endmacro
2407
;                 insn, %2/%3 = masks passed to IEM_MAYBE_LOAD_FLAGS/IEM_SAVE_FLAGS,
;                 %4 = signed (idiv), %5 = name suffix, %6 = flag style (0=plain, 1=Intel, 2=AMD).
IEMIMPL_DIV_OP div, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 0, , 0
IEMIMPL_DIV_OP div, 0, 0, 0, _intel, 1
IEMIMPL_DIV_OP div, 0, 0, 0, _amd, 2
IEMIMPL_DIV_OP idiv, 0, (X86_EFL_OF | X86_EFL_SF | X86_EFL_ZF | X86_EFL_AF | X86_EFL_PF | X86_EFL_CF), 1, , 0
IEMIMPL_DIV_OP idiv, 0, 0, 1, _intel, 1
IEMIMPL_DIV_OP idiv, 0, 0, 1, _amd, 2
2414
2415
2416;;
2417; Macro for implementing memory fence operation.
2418;
2419; No return value, no operands or anything.
2420;
2421; @param 1 The instruction.
2422;
%macro IEMIMPL_MEM_FENCE 1
BEGINCODE
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 0
        %1                              ; Emit the fence instruction itself (lfence/sfence/mfence).
        ret
ENDPROC iemAImpl_ %+ %1
%endmacro
2430
; One wrapper per SSE/SSE2 fence instruction.
IEMIMPL_MEM_FENCE lfence
IEMIMPL_MEM_FENCE sfence
IEMIMPL_MEM_FENCE mfence
2434
2435;;
2436; Alternative for non-SSE2 host.
2437;
BEGINPROC_FASTCALL iemAImpl_alt_mem_fence, 0
        push    xAX                     ; Create a scratch stack slot ...
        xchg    xAX, [xSP]              ; ... and do a locked access to it; xchg with a memory operand
                                        ; has an implicit LOCK prefix and thus serializes memory.
        add     xSP, xCB                ; Drop the slot (xAX got its own value back from the xchg).
        ret
ENDPROC iemAImpl_alt_mem_fence
2444
2445
2446;;
2447; Initialize the FPU for the actual instruction being emulated, this means
2448; loading parts of the guest's control word and status word.
2449;
2450; @uses 24 bytes of stack. T0, T1
2451; @param 1 Expression giving the address of the FXSTATE of the guest.
2452;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW 1
        fnstenv [xSP]                   ; Start from the current (host) environment.

        ; FCW - for exception, precision and rounding control.
        ; Note! Using the 32-bit register forms here like the _AND_FTW_0
        ;       sibling macro does; movzx/and zero-extend, so this is
        ;       equivalent and avoids REX prefixes on AMD64 hosts.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; Guest condition codes ...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; ... merged with the current (safe) TOP.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
2471
2472
2473;;
2474; Initialize the FPU for the actual instruction being emulated, this means
2475; loading parts of the guest's control word, status word, and update the
2476; tag word for the top register if it's empty.
2477;
2478; ASSUMES actual TOP=7
2479;
2480; @uses 24 bytes of stack. T0, T1
2481; @param 1 Expression giving the address of the FXSTATE of the guest.
2482;
%macro FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 1
        fnstenv [xSP]                   ; Start from the current (host) environment.

        ; FCW - for exception, precision and rounding control.
        movzx   T0_32, word [%1 + X86FXSTATE.FCW]
        and     T0_32, X86_FCW_MASK_ALL | X86_FCW_PC_MASK | X86_FCW_RC_MASK
        mov     [xSP + X86FSTENV32P.FCW], T0_16

        ; FSW - for undefined C0, C1, C2, and C3.
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        and     T1_32, X86_FSW_C_MASK   ; Guest condition codes ...
        movzx   T0_32, word [xSP + X86FSTENV32P.FSW]
        and     T0_32, X86_FSW_TOP_MASK ; ... merged with the current (safe) TOP.
        or      T0_32, T1_32
        mov     [xSP + X86FSTENV32P.FSW], T0_16

        ; FTW - Only for ST0 (in/out).
        movzx   T1_32, word [%1 + X86FXSTATE.FSW]
        shr     T1_32, X86_FSW_TOP_SHIFT
        and     T1_32, X86_FSW_TOP_SMASK ; T1 = guest TOP, i.e. the FTW bit index of guest ST0.
        bt      [%1 + X86FXSTATE.FTW], T1_16 ; Empty if FTW bit is clear. Fixed register order.
        jc      %%st0_not_empty
        or      word [xSP + X86FSTENV32P.FTW], 0c000h ; TOP=7, so set TAG(7)=3
%%st0_not_empty:

        fldenv  [xSP]                   ; Activate the merged environment.
%endmacro
2510
2511
2512;;
2513; Need to move this as well somewhere better?
2514;
struc IEMFPURESULT
    .r80Result  resw 5                  ; The 80-bit result value (10 bytes).
    .FSW        resw 1                  ; The output FPU status word.
endstruc
2519
2520
2521;;
2522; Need to move this as well somewhere better?
2523;
struc IEMFPURESULTTWO
    .r80Result1 resw 5                  ; The first 80-bit result value.
    .FSW        resw 1                  ; The output FPU status word.
    .r80Result2 resw 5                  ; The second 80-bit result value.
endstruc
2529
2530
2531;
2532;---------------------- 16-bit signed integer operations ----------------------
2533;
2534
2535
;;
; Converts a 16-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 16-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i16, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fild    word [A2]               ; st0 = (extended)*(int16_t *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i16
2559
2560
2561;;
2562; Store a 80-bit floating point value (register) as a 16-bit signed integer (memory).
2563;
2564; @param A0 FPU context (fxsave).
2565; @param A1 Where to return the output FSW.
2566; @param A2 Where to store the 16-bit signed integer value.
2567; @param A3 Pointer to the 80-bit value.
2568;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW (rounding mode matters here).
        fistp   word [A2]               ; Store rounded to 16-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i16
2584
2585
2586;;
2587; Store a 80-bit floating point value (register) as a 16-bit signed integer
2588; (memory) with truncation.
2589;
2590; @param A0 FPU context (fxsave).
2591; @param A1 Where to return the output FSW.
2592; @param A2 Where to store the 16-bit signed integer value.
2593; @param A3 Pointer to the 80-bit value.
2594;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW.
        fisttp  word [A2]               ; Store truncated (SSE3) to 16-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i16
2610
2611
2612;;
2613; FPU instruction working on one 80-bit and one 16-bit signed integer value.
2614;
2615; @param 1 The instruction
2616;
2617; @param A0 FPU context (fxsave).
2618; @param A1 Pointer to a IEMFPURESULT for the output.
2619; @param A2 Pointer to the 80-bit value.
2620; @param A3 Pointer to the 16-bit value.
2621;
%macro IEMIMPL_FPU_R80_BY_I16 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      word [A3]               ; st0 = st0 <op> *(int16_t *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro
2641
; One worker per 16-bit integer arithmetic instruction.
IEMIMPL_FPU_R80_BY_I16 fiadd
IEMIMPL_FPU_R80_BY_I16 fimul
IEMIMPL_FPU_R80_BY_I16 fisub
IEMIMPL_FPU_R80_BY_I16 fisubr
IEMIMPL_FPU_R80_BY_I16 fidiv
IEMIMPL_FPU_R80_BY_I16 fidivr
2648
2649
2650;;
2651; FPU instruction working on one 80-bit and one 16-bit signed integer value,
2652; only returning FSW.
2653;
2654; @param 1 The instruction
2655;
2656; @param A0 FPU context (fxsave).
2657; @param A1 Where to store the output FSW.
2658; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 16-bit value.
2660;
%macro IEMIMPL_FPU_R80_BY_I16_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i16, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      word [A3]               ; Compare st0 against *(int16_t *)A3 (no value result).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i16
%endmacro
2678
2679IEMIMPL_FPU_R80_BY_I16_FSW ficom
2680
2681
2682
2683;
2684;---------------------- 32-bit signed integer operations ----------------------
2685;
2686
2687
;;
; Converts a 32-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 32-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fild    dword [A2]              ; st0 = (extended)*(int32_t *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i32
2711
2712
2713;;
2714; Store a 80-bit floating point value (register) as a 32-bit signed integer (memory).
2715;
2716; @param A0 FPU context (fxsave).
2717; @param A1 Where to return the output FSW.
2718; @param A2 Where to store the 32-bit signed integer value.
2719; @param A3 Pointer to the 80-bit value.
2720;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW (rounding mode matters here).
        fistp   dword [A2]              ; Store rounded to 32-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i32
2736
2737
2738;;
2739; Store a 80-bit floating point value (register) as a 32-bit signed integer
2740; (memory) with truncation.
2741;
2742; @param A0 FPU context (fxsave).
2743; @param A1 Where to return the output FSW.
2744; @param A2 Where to store the 32-bit signed integer value.
2745; @param A3 Pointer to the 80-bit value.
2746;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW.
        fisttp  dword [A2]              ; Store truncated (SSE3) to 32-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i32
2762
2763
2764;;
2765; FPU instruction working on one 80-bit and one 32-bit signed integer value.
2766;
2767; @param 1 The instruction
2768;
2769; @param A0 FPU context (fxsave).
2770; @param A1 Pointer to a IEMFPURESULT for the output.
2771; @param A2 Pointer to the 80-bit value.
2772; @param A3 Pointer to the 32-bit value.
2773;
%macro IEMIMPL_FPU_R80_BY_I32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      dword [A3]              ; st0 = st0 <op> *(int32_t *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro
2793
; One worker per 32-bit integer arithmetic instruction.
IEMIMPL_FPU_R80_BY_I32 fiadd
IEMIMPL_FPU_R80_BY_I32 fimul
IEMIMPL_FPU_R80_BY_I32 fisub
IEMIMPL_FPU_R80_BY_I32 fisubr
IEMIMPL_FPU_R80_BY_I32 fidiv
IEMIMPL_FPU_R80_BY_I32 fidivr
2800
2801
2802;;
2803; FPU instruction working on one 80-bit and one 32-bit signed integer value,
2804; only returning FSW.
2805;
2806; @param 1 The instruction
2807;
2808; @param A0 FPU context (fxsave).
2809; @param A1 Where to store the output FSW.
2810; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
2812;
%macro IEMIMPL_FPU_R80_BY_I32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_i32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      dword [A3]              ; Compare st0 against *(int32_t *)A3 (no value result).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_i32
%endmacro
2830
2831IEMIMPL_FPU_R80_BY_I32_FSW ficom
2832
2833
2834
2835;
2836;---------------------- 64-bit signed integer operations ----------------------
2837;
2838
2839
;;
; Converts a 64-bit signed integer to an 80-bit floating point value (fpu register).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the 64-bit signed integer value to convert.
;
BEGINPROC_FASTCALL iemAImpl_fild_r80_from_i64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fild    qword [A2]              ; st0 = (extended)*(int64_t *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fild_r80_from_i64
2863
2864
2865;;
2866; Store a 80-bit floating point value (register) as a 64-bit signed integer (memory).
2867;
2868; @param A0 FPU context (fxsave).
2869; @param A1 Where to return the output FSW.
2870; @param A2 Where to store the 64-bit signed integer value.
2871; @param A3 Pointer to the 80-bit value.
2872;
BEGINPROC_FASTCALL iemAImpl_fist_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW (rounding mode matters here).
        fistp   qword [A2]              ; Store rounded to 64-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fist_r80_to_i64
2888
2889
2890;;
2891; Store a 80-bit floating point value (register) as a 64-bit signed integer
2892; (memory) with truncation.
2893;
2894; @param A0 FPU context (fxsave).
2895; @param A1 Where to return the output FSW.
2896; @param A2 Where to store the 64-bit signed integer value.
2897; @param A3 Pointer to the 80-bit value.
2898;
BEGINPROC_FASTCALL iemAImpl_fistt_r80_to_i64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW.
        fisttp  qword [A2]              ; Store truncated (SSE3) to 64-bit signed integer.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fistt_r80_to_i64
2914
2915
2916
2917;
2918;---------------------- 32-bit floating point operations ----------------------
2919;
2920
2921;;
2922; Converts a 32-bit floating point value to a 80-bit one (fpu register).
2923;
2924; @param A0 FPU context (fxsave).
2925; @param A1 Pointer to a IEMFPURESULT for the output.
2926; @param A2 Pointer to the 32-bit floating point value to convert.
2927;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r32, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fld     dword [A2]              ; st0 = (extended)*(float *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r32
2944
2945
2946;;
2947; Store a 80-bit floating point value (register) as a 32-bit one (memory).
2948;
2949; @param A0 FPU context (fxsave).
2950; @param A1 Where to return the output FSW.
2951; @param A2 Where to store the 32-bit value.
2952; @param A3 Pointer to the 80-bit value.
2953;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW (rounding/precision matters here).
        fst     dword [A2]              ; Store as single precision.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r32
2969
2970
2971;;
2972; FPU instruction working on one 80-bit and one 32-bit floating point value.
2973;
2974; @param 1 The instruction
2975;
2976; @param A0 FPU context (fxsave).
2977; @param A1 Pointer to a IEMFPURESULT for the output.
2978; @param A2 Pointer to the 80-bit value.
2979; @param A3 Pointer to the 32-bit value.
2980;
%macro IEMIMPL_FPU_R80_BY_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      dword [A3]              ; st0 = st0 <op> *(float *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro
3000
; One worker per r32 arithmetic instruction.
IEMIMPL_FPU_R80_BY_R32 fadd
IEMIMPL_FPU_R80_BY_R32 fmul
IEMIMPL_FPU_R80_BY_R32 fsub
IEMIMPL_FPU_R80_BY_R32 fsubr
IEMIMPL_FPU_R80_BY_R32 fdiv
IEMIMPL_FPU_R80_BY_R32 fdivr
3007
3008
3009;;
3010; FPU instruction working on one 80-bit and one 32-bit floating point value,
3011; only returning FSW.
3012;
3013; @param 1 The instruction
3014;
3015; @param A0 FPU context (fxsave).
3016; @param A1 Where to store the output FSW.
3017; @param A2 Pointer to the 80-bit value.
; @param A3 Pointer to the 32-bit value.
3019;
%macro IEMIMPL_FPU_R80_BY_R32_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r32, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      dword [A3]              ; Compare st0 against *(float *)A3 (no value result).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r32
%endmacro
3037
3038IEMIMPL_FPU_R80_BY_R32_FSW fcom
3039
3040
3041
3042;
3043;---------------------- 64-bit floating point operations ----------------------
3044;
3045
3046;;
3047; Converts a 64-bit floating point value to a 80-bit one (fpu register).
3048;
3049; @param A0 FPU context (fxsave).
3050; @param A1 Pointer to a IEMFPURESULT for the output.
3051; @param A2 Pointer to the 64-bit floating point value to convert.
3052;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r64, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fld     qword [A2]              ; st0 = (extended)*(double *)A2

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r64
3069
3070
3071;;
3072; Store a 80-bit floating point value (register) as a 64-bit one (memory).
3073;
3074; @param A0 FPU context (fxsave).
3075; @param A1 Where to return the output FSW.
3076; @param A2 Where to store the 64-bit value.
3077; @param A3 Pointer to the 80-bit value.
3078;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit input value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW (rounding/precision matters here).
        fst     qword [A2]              ; Store as double precision.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r64
3094
3095
3096;;
3097; FPU instruction working on one 80-bit and one 64-bit floating point value.
3098;
3099; @param 1 The instruction
3100;
3101; @param A0 FPU context (fxsave).
3102; @param A1 Pointer to a IEMFPURESULT for the output.
3103; @param A2 Pointer to the 80-bit value.
3104; @param A3 Pointer to the 64-bit value.
3105;
%macro IEMIMPL_FPU_R80_BY_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      qword [A3]              ; st0 = st0 <op> *(double *)A3

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro
3125
; One worker per r64 arithmetic instruction.
IEMIMPL_FPU_R80_BY_R64 fadd
IEMIMPL_FPU_R80_BY_R64 fmul
IEMIMPL_FPU_R80_BY_R64 fsub
IEMIMPL_FPU_R80_BY_R64 fsubr
IEMIMPL_FPU_R80_BY_R64 fdiv
IEMIMPL_FPU_R80_BY_R64 fdivr
3132
3133;;
3134; FPU instruction working on one 80-bit and one 64-bit floating point value,
3135; only returning FSW.
3136;
3137; @param 1 The instruction
3138;
3139; @param A0 FPU context (fxsave).
3140; @param A1 Where to store the output FSW.
3141; @param A2 Pointer to the 80-bit value.
3142; @param A3 Pointer to the 64-bit value.
3143;
%macro IEMIMPL_FPU_R80_BY_R64_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r64, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the 80-bit operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      qword [A3]              ; Compare st0 against *(double *)A3 (no value result).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r64
%endmacro
3161
3162IEMIMPL_FPU_R80_BY_R64_FSW fcom
3163
3164
3165
3166;
3167;---------------------- 80-bit floating point operations ----------------------
3168;
3169
3170;;
3171; Loads a 80-bit floating point register value from memory.
3172;
3173; @param A0 FPU context (fxsave).
3174; @param A1 Pointer to a IEMFPURESULT for the output.
3175; @param A2 Pointer to the 80-bit floating point value to load.
3176;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fld     tword [A2]              ; st0 = the 80-bit input (may raise #IS on special values).

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_r80
3193
3194
3195;;
3196; Store a 80-bit floating point register to memory
3197;
3198; @param A0 FPU context (fxsave).
3199; @param A1 Where to return the output FSW.
3200; @param A2 Where to store the 80-bit value.
3201; @param A3 Pointer to the 80-bit register value.
3202;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit register value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fstp    tword [A2]              ; Store it (80-bit store is exact, no rounding).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_r80
3218
3219
3220;;
3221; Loads an 80-bit floating point register value in BCD format from memory.
3222;
3223; @param A0 FPU context (fxsave).
3224; @param A1 Pointer to a IEMFPURESULT for the output.
3225; @param A2 Pointer to the 80-bit BCD value to load.
3226;
BEGINPROC_FASTCALL iemAImpl_fld_r80_from_d80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit                          ; Start with a clean FPU state.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fbld    tword [A2]              ; st0 = the packed BCD value converted to extended.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_fld_r80_from_d80
3243
3244
3245;;
3246; Store a 80-bit floating point register to memory as BCD
3247;
3248; @param A0 FPU context (fxsave).
3249; @param A1 Where to return the output FSW.
3250; @param A2 Where to store the 80-bit BCD value.
3251; @param A3 Pointer to the 80-bit register value.
3252;
BEGINPROC_FASTCALL iemAImpl_fst_r80_to_d80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; Load the 80-bit register value.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        fbstp   tword [A2]              ; Store as packed BCD (rounds to integer).

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_fst_r80_to_d80
3268
3269
3270;;
3271; FPU instruction working on two 80-bit floating point values.
3272;
; @param 1 The instruction
; @param 2 The ST register operand list for the instruction, e.g. {st0, st1},
;          or {} for instructions with implicit operands (fprem, fscale, ...).
;
; @param A0 FPU context (fxsave).
; @param A1 Pointer to a IEMFPURESULT for the output.
; @param A2 Pointer to the first 80-bit value (ST0)
; @param A3 Pointer to the second 80-bit value (STn).
%macro IEMIMPL_FPU_R80_BY_R80 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; st1 = second operand ...
        fld     tword [A2]              ; ... st0 = first operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      %2                      ; Result ends up in st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3300
; Explicit two-register forms, plus instructions with implicit st0/st1 operands.
IEMIMPL_FPU_R80_BY_R80 fadd, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fmul, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsub, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fsubr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdiv, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fdivr, {st0, st1}
IEMIMPL_FPU_R80_BY_R80 fprem, {}
IEMIMPL_FPU_R80_BY_R80 fprem1, {}
IEMIMPL_FPU_R80_BY_R80 fscale, {}
3310
3311
3312;;
3313; FPU instruction working on two 80-bit floating point values, ST1 and ST0,
3314; storing the result in ST1 and popping the stack.
3315;
3316; @param 1 The instruction
3317;
3318; @param A0 FPU context (fxsave).
3319; @param A1 Pointer to a IEMFPURESULT for the output.
3320; @param A2 Pointer to the first 80-bit value (ST1).
3321; @param A3 Pointer to the second 80-bit value (ST0).
3322;
%macro IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st1 = the ST1 operand ...
        fld     tword [A3]              ; ... st0 = the ST0 operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1                              ; Stores into st1 and pops, leaving the result in st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3343
; Instructions operating on st1/st0 that pop the stack.
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fpatan
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2x
IEMIMPL_FPU_R80_BY_R80_ST1_ST0_POP fyl2xp1
3347
3348
3349;;
3350; FPU instruction working on two 80-bit floating point values, only
3351; returning FSW.
3352;
3353; @param 1 The instruction
3354;
3355; @param A0 FPU context (fxsave).
3356; @param A1 Pointer to a uint16_t for the resulting FSW.
3357; @param A2 Pointer to the first 80-bit value.
3358; @param A3 Pointer to the second 80-bit value.
3359;
%macro IEMIMPL_FPU_R80_BY_R80_FSW 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; st1 = second operand ...
        fld     tword [A2]              ; ... st0 = first operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      st0, st1                ; Compare; result is in the FSW condition codes only.

        fnstsw  word [A1]               ; Return the resulting status word.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3378
IEMIMPL_FPU_R80_BY_R80_FSW fcom         ; Ordered compare.
IEMIMPL_FPU_R80_BY_R80_FSW fucom        ; Unordered compare (quiet on QNaNs).
3381
3382
3383;;
3384; FPU instruction working on two 80-bit floating point values,
3385; returning FSW and EFLAGS (eax).
3386;
3387; @param 1 The instruction
3388;
3389; @returns EFLAGS in EAX.
3390; @param A0 FPU context (fxsave).
3391; @param A1 Pointer to a uint16_t for the resulting FSW.
3392; @param A2 Pointer to the first 80-bit value.
3393; @param A3 Pointer to the second 80-bit value.
3394;
%macro IEMIMPL_FPU_R80_BY_R80_EFL 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_by_r80, 16
        PROLOGUE_4_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A3]              ; st1 = second operand ...
        fld     tword [A2]              ; ... st0 = first operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1      st1                     ; fcomi/fucomi compares st0 with st1, setting ZF/PF/CF.

        fnstsw  word [A1]               ; Return the resulting status word.
        pushf                           ; Capture the EFLAGS the instruction produced ...
        pop     xAX                     ; ... and return them in xAX.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_by_r80
%endmacro
3415
IEMIMPL_FPU_R80_BY_R80_EFL fcomi        ; Ordered compare setting EFLAGS.
IEMIMPL_FPU_R80_BY_R80_EFL fucomi       ; Unordered compare setting EFLAGS.
3418
3419
3420;;
3421; FPU instruction working on one 80-bit floating point value.
3422;
3423; @param 1 The instruction
3424;
3425; @param A0 FPU context (fxsave).
3426; @param A1 Pointer to a IEMFPURESULT for the output.
3427; @param A2 Pointer to the 80-bit value.
3428;
%macro IEMIMPL_FPU_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; Stack space for the fnstenv/fldenv in the macro below.

        fninit
        fld     tword [A2]              ; st0 = the operand.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; Apply guest FCW and condition codes.
        %1                              ; Unary operation on st0.

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; Return the resulting status word ...
        fnclex                          ; ... then clear pending exceptions,
        fstp    tword [A1 + IEMFPURESULT.r80Result] ; and return the 80-bit result.

        fninit                          ; Leave a clean FPU state for the host.
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro
3448
3449IEMIMPL_FPU_R80 fchs
3450IEMIMPL_FPU_R80 fabs
3451IEMIMPL_FPU_R80 f2xm1
3452IEMIMPL_FPU_R80 fsqrt
3453IEMIMPL_FPU_R80 frndint
3454IEMIMPL_FPU_R80 fsin
3455IEMIMPL_FPU_R80 fcos
3456
3457
3458;;
3459; FPU instruction working on one 80-bit floating point value, only
3460; returning FSW.
3461;
3462; @param 1 The instruction
3463; @param 2 Non-zero to also restore FTW.
3464;
3465; @param A0 FPU context (fxsave).
3466; @param A1 Pointer to a uint16_t for the resulting FSW.
3467; @param A2 Pointer to the 80-bit value.
3468;
%macro IEMIMPL_FPU_R80_FSW 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area used by the FPU_LD_FXSTATE_* helper

        fninit                          ; start from a clean host FPU state
        fld     tword [A2]              ; st0 = the input value
%if %2 != 0
        ; Also restore FTW (tag word) so e.g. fxam can report empty registers correctly.
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW_AND_FTW_0 A0
%else
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0
%endif
        %1                              ; the instruction; only FSW is of interest

        fnstsw  word [A1]               ; return the resulting status word

        fninit                          ; don't leak guest values into the host FPU state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80
%endmacro

IEMIMPL_FPU_R80_FSW ftst, 0
IEMIMPL_FPU_R80_FSW fxam, 1 ; No #IS or any other FP exceptions.
3493
3494
3495
3496;;
3497; FPU instruction loading a 80-bit floating point constant.
3498;
3499; @param 1 The instruction
3500;
3501; @param A0 FPU context (fxsave).
3502; @param A1 Pointer to a IEMFPURESULT for the output.
3503;
%macro IEMIMPL_FPU_R80_CONST 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1, 8
        PROLOGUE_2_ARGS
        sub     xSP, 20h                ; scratch area used by the FPU_LD_FXSTATE_* helper

        fninit                          ; start from a clean host FPU state
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; load guest FCW (rounding/precision affect the constants)
        %1                              ; the fldXX constant-load instruction; pushes onto st0

        fnstsw  word [A1 + IEMFPURESULT.FSW] ; store FSW before fnclex wipes the exception bits
        fnclex                          ; clear exceptions so the fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULT.r80Result]

        fninit                          ; don't leak guest values into the host FPU state
        add     xSP, 20h
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+
%endmacro

IEMIMPL_FPU_R80_CONST fld1
IEMIMPL_FPU_R80_CONST fldl2t
IEMIMPL_FPU_R80_CONST fldl2e
IEMIMPL_FPU_R80_CONST fldpi
IEMIMPL_FPU_R80_CONST fldlg2
IEMIMPL_FPU_R80_CONST fldln2
IEMIMPL_FPU_R80_CONST fldz
3530
3531
3532;;
3533; FPU instruction working on one 80-bit floating point value, outputing two.
3534;
3535; @param 1 The instruction
3536;
3537; @param A0 FPU context (fxsave).
3538; @param A1 Pointer to a IEMFPURESULTTWO for the output.
3539; @param A2 Pointer to the 80-bit value.
3540;
%macro IEMIMPL_FPU_R80_R80 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _r80_r80, 12
        PROLOGUE_3_ARGS
        sub     xSP, 20h                ; scratch area used by the FPU_LD_FXSTATE_* helper

        fninit                          ; start from a clean host FPU state
        fld     tword [A2]              ; st0 = the input value
        FPU_LD_FXSTATE_FCW_AND_SAFE_FSW A0 ; load guest FCW (and a safe FSW) from the fxsave image at A0
        %1                              ; the instruction; pushes a second result, so st0/st1 hold the outputs

        fnstsw  word [A1 + IEMFPURESULT TWO.FSW] ; (see note) store FSW before clearing exceptions
        fnclex                          ; clear exceptions so each fstp below cannot fault
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result2] ; st0 = the pushed (second) result
        fnclex
        fstp    tword [A1 + IEMFPURESULTTWO.r80Result1] ; st1 = the first result

        fninit                          ; don't leak guest values into the host FPU state
        add     xSP, 20h
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _r80_r80
%endmacro

IEMIMPL_FPU_R80_R80 fptan
IEMIMPL_FPU_R80_R80 fxtract
IEMIMPL_FPU_R80_R80 fsincos
3566
3567
3568
3569
3570;---------------------- SSE and MMX Operations ----------------------
3571
;
; Placeholder prologue/epilogue pairs bracketing MMX/SSE/AVX usage in the
; helpers below.  Currently empty - any host state save/restore that turns
; out to be required can be added here in one place.
;
;; @todo what do we need to do for MMX?
%macro IEMIMPL_MMX_PROLOGUE 0
%endmacro
%macro IEMIMPL_MMX_EPILOGUE 0
%endmacro

;; @todo what do we need to do for SSE?
%macro IEMIMPL_SSE_PROLOGUE 0
%endmacro
%macro IEMIMPL_SSE_EPILOGUE 0
%endmacro

;; @todo what do we need to do for AVX?
%macro IEMIMPL_AVX_PROLOGUE 0
%endmacro
%macro IEMIMPL_AVX_EPILOGUE 0
%endmacro
3589
3590
3591;;
3592; Media instruction working on two full sized registers.
3593;
3594; @param 1 The instruction
3595; @param 2 Whether there is an MMX variant (1) or not (0).
3596;
3597; @param A0 FPU context (fxsave).
3598; @param A1 Pointer to the first media register size operand (input/output).
3599; @param A2 Pointer to the second media register size operand (input).
3600;
%macro IEMIMPL_MEDIA_F2 2
%if %2 != 0
; 64-bit MMX variant: [A1] op= [A2].  (A0, the FPU context, is currently unused here.)
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 12
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A1]               ; first operand / destination
        movq    mm1, [A2]               ; second operand
        %1      mm0, mm1
        movq    [A1], mm0               ; write the result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant: [A1] op= [A2].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]              ; unaligned loads: the operands are plain memory copies
        movdqu  xmm1, [A2]
        %1      xmm0, xmm1
        movdqu  [A1], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3630
; Instantiations; the second parameter selects whether an MMX (_u64) variant
; is generated in addition to the SSE (_u128) one.
IEMIMPL_MEDIA_F2 pshufb, 1
IEMIMPL_MEDIA_F2 pand, 1
IEMIMPL_MEDIA_F2 pandn, 1
IEMIMPL_MEDIA_F2 por, 1
IEMIMPL_MEDIA_F2 pxor, 1
IEMIMPL_MEDIA_F2 pcmpeqb, 1
IEMIMPL_MEDIA_F2 pcmpeqw, 1
IEMIMPL_MEDIA_F2 pcmpeqd, 1
IEMIMPL_MEDIA_F2 pcmpeqq, 0
IEMIMPL_MEDIA_F2 pcmpgtb, 1
IEMIMPL_MEDIA_F2 pcmpgtw, 1
IEMIMPL_MEDIA_F2 pcmpgtd, 1
IEMIMPL_MEDIA_F2 pcmpgtq, 0
IEMIMPL_MEDIA_F2 paddb, 1
IEMIMPL_MEDIA_F2 paddw, 1
IEMIMPL_MEDIA_F2 paddd, 1
IEMIMPL_MEDIA_F2 paddq, 1
IEMIMPL_MEDIA_F2 paddsb, 1
IEMIMPL_MEDIA_F2 paddsw, 1
IEMIMPL_MEDIA_F2 paddusb, 1
IEMIMPL_MEDIA_F2 paddusw, 1
IEMIMPL_MEDIA_F2 psubb, 1
IEMIMPL_MEDIA_F2 psubw, 1
IEMIMPL_MEDIA_F2 psubd, 1
IEMIMPL_MEDIA_F2 psubq, 1
IEMIMPL_MEDIA_F2 psubsb, 1
IEMIMPL_MEDIA_F2 psubsw, 1
IEMIMPL_MEDIA_F2 psubusb, 1
IEMIMPL_MEDIA_F2 psubusw, 1
IEMIMPL_MEDIA_F2 pmullw, 1
IEMIMPL_MEDIA_F2 pmulld, 0
IEMIMPL_MEDIA_F2 pmulhw, 1
IEMIMPL_MEDIA_F2 pmaddwd, 1
IEMIMPL_MEDIA_F2 pminub, 1
IEMIMPL_MEDIA_F2 pminuw, 0
IEMIMPL_MEDIA_F2 pminud, 0
IEMIMPL_MEDIA_F2 pminsb, 0
IEMIMPL_MEDIA_F2 pminsw, 1
IEMIMPL_MEDIA_F2 pminsd, 0
IEMIMPL_MEDIA_F2 pmaxub, 1
IEMIMPL_MEDIA_F2 pmaxuw, 0
IEMIMPL_MEDIA_F2 pmaxud, 0
IEMIMPL_MEDIA_F2 pmaxsb, 0
IEMIMPL_MEDIA_F2 pmaxsw, 1
IEMIMPL_MEDIA_F2 pmaxsd, 0
IEMIMPL_MEDIA_F2 pabsb, 1
IEMIMPL_MEDIA_F2 pabsw, 1
IEMIMPL_MEDIA_F2 pabsd, 1
IEMIMPL_MEDIA_F2 psignb, 1
IEMIMPL_MEDIA_F2 psignw, 1
IEMIMPL_MEDIA_F2 psignd, 1
IEMIMPL_MEDIA_F2 phaddw, 1
IEMIMPL_MEDIA_F2 phaddd, 1
IEMIMPL_MEDIA_F2 phsubw, 1
IEMIMPL_MEDIA_F2 phsubd, 1
IEMIMPL_MEDIA_F2 phaddsw, 1
IEMIMPL_MEDIA_F2 phsubsw, 1
IEMIMPL_MEDIA_F2 pmaddubsw, 1
IEMIMPL_MEDIA_F2 pmulhrsw, 1
IEMIMPL_MEDIA_F2 pmuludq, 1
3691
3692
3693;;
3694; Media instruction working on two full sized registers, but no FXSAVE state argument.
3695;
3696; @param 1 The instruction
3697; @param 2 Whether there is an MMX variant (1) or not (0).
3698;
3699; @param A0 Pointer to the first media register size operand (input/output).
3700; @param A1 Pointer to the second media register size operand (input).
3701;
%macro IEMIMPL_MEDIA_OPT_F2 2
%if %2 != 0
; 64-bit MMX variant: [A0] op= [A1].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; first operand / destination
        movq    mm1, [A1]               ; second operand
        %1      mm0, mm1
        movq    [A0], mm0               ; write the result back to the first operand

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endif

; 128-bit SSE variant: [A0] op= [A1].
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; unaligned loads: the operands are plain memory copies
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
3731
; Instantiations; the second parameter selects whether an MMX (_u64) variant
; is generated in addition to the SSE (_u128) one.
IEMIMPL_MEDIA_OPT_F2 packsswb, 1
IEMIMPL_MEDIA_OPT_F2 packssdw, 1
IEMIMPL_MEDIA_OPT_F2 packuswb, 1
IEMIMPL_MEDIA_OPT_F2 packusdw, 0
IEMIMPL_MEDIA_OPT_F2 psllw, 1
IEMIMPL_MEDIA_OPT_F2 pslld, 1
IEMIMPL_MEDIA_OPT_F2 psllq, 1
IEMIMPL_MEDIA_OPT_F2 psrlw, 1
IEMIMPL_MEDIA_OPT_F2 psrld, 1
IEMIMPL_MEDIA_OPT_F2 psrlq, 1
IEMIMPL_MEDIA_OPT_F2 psraw, 1
IEMIMPL_MEDIA_OPT_F2 psrad, 1
IEMIMPL_MEDIA_OPT_F2 pmulhuw, 1
IEMIMPL_MEDIA_OPT_F2 pavgb, 1
IEMIMPL_MEDIA_OPT_F2 pavgw, 1
IEMIMPL_MEDIA_OPT_F2 psadbw, 1
IEMIMPL_MEDIA_OPT_F2 pmuldq, 0
IEMIMPL_MEDIA_OPT_F2 unpcklps, 0
IEMIMPL_MEDIA_OPT_F2 unpcklpd, 0
IEMIMPL_MEDIA_OPT_F2 unpckhps, 0
IEMIMPL_MEDIA_OPT_F2 unpckhpd, 0
IEMIMPL_MEDIA_OPT_F2 phminposuw, 0
IEMIMPL_MEDIA_OPT_F2 aesimc, 0
IEMIMPL_MEDIA_OPT_F2 aesenc, 0
IEMIMPL_MEDIA_OPT_F2 aesdec, 0
IEMIMPL_MEDIA_OPT_F2 aesenclast, 0
IEMIMPL_MEDIA_OPT_F2 aesdeclast, 0
IEMIMPL_MEDIA_OPT_F2 sha1nexte, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha1msg2, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg1, 0
IEMIMPL_MEDIA_OPT_F2 sha256msg2, 0
3764
3765;;
3766; Media instruction working on one full sized and one half sized register (lower half).
3767;
3768; @param 1 The instruction
3769; @param 2 1 if MMX is included, 0 if not.
3770;
3771; @param A0 Pointer to the first full sized media register operand (input/output).
3772; @param A1 Pointer to the second half sized media register operand (input).
3773;
%macro IEMIMPL_MEDIA_F1L1 2
 %if %2 != 0
; 64-bit MMX variant: [A0] = %1([A0], [A1]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; full sized destination operand
        movq    mm1, [A1]               ; source; only the relevant half is used by the instruction
        %1      mm0, mm1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
 %endif

; 128-bit SSE variant: [A0] = %1([A0], [A1]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        %1      xmm0, xmm1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_F1L1 punpcklbw, 1
IEMIMPL_MEDIA_F1L1 punpcklwd, 1
IEMIMPL_MEDIA_F1L1 punpckldq, 1
IEMIMPL_MEDIA_F1L1 punpcklqdq, 0
3808
3809
3810;;
3811; Media instruction working two half sized input registers (lower half) and a full sized
3812; destination register (vpunpckh*).
3813;
3814; @param 1 The instruction
3815;
3816; @param A0 Pointer to the destination register (full sized, output only).
3817; @param A1 Pointer to the first full sized media source register operand, where we
3818; will only use the lower half as input - but we'll be loading it in full.
3819; @param A2 Pointer to the second full sized media source register operand, where we
3820; will only use the lower half as input - but we'll be loading it in full.
3821;
%macro IEMIMPL_MEDIA_F1L1L1 1
; 128-bit variant: [A0] = %1([A1], [A2]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source (only its low half is used by vpunpckl*)
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; full sized destination

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: [A0] = %1([A1], [A2]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F1L1L1 vpunpcklbw
IEMIMPL_MEDIA_F1L1L1 vpunpcklwd
IEMIMPL_MEDIA_F1L1L1 vpunpckldq
IEMIMPL_MEDIA_F1L1L1 vpunpcklqdq
3854
3855
3856;;
3857; Media instruction working on one full sized and one half sized register (high half).
3858;
3859; @param 1 The instruction
3860; @param 2 1 if MMX is included, 0 if not.
3861;
3862; @param A0 Pointer to the first full sized media register operand (input/output).
3863; @param A1 Pointer to the second full sized media register operand, where we
3864; will only use the upper half as input - but we'll load it in full.
3865;
; Same code shape as the low-half variant; the punpckh* instructions simply
; consume the upper halves of the (fully loaded) operands.
%macro IEMIMPL_MEDIA_F1H1 2
IEMIMPL_MEDIA_F1L1 %1, %2
%endmacro

IEMIMPL_MEDIA_F1L1 punpckhbw, 1
IEMIMPL_MEDIA_F1L1 punpckhwd, 1
IEMIMPL_MEDIA_F1L1 punpckhdq, 1
IEMIMPL_MEDIA_F1L1 punpckhqdq, 0
3874
3875
3876;;
3877; Media instruction working two half sized input registers (high half) and a full sized
3878; destination register (vpunpckh*).
3879;
3880; @param 1 The instruction
3881;
3882; @param A0 Pointer to the destination register (full sized, output only).
3883; @param A1 Pointer to the first full sized media source register operand, where we
3884; will only use the upper half as input - but we'll be loading it in full.
3885; @param A2 Pointer to the second full sized media source register operand, where we
3886; will only use the upper half as input - but we'll be loading it in full.
3887;
; Same code shape as the low-half AVX variant; the vpunpckh* instructions
; simply consume the upper halves of the (fully loaded) operands.
%macro IEMIMPL_MEDIA_F1H1H1 1
IEMIMPL_MEDIA_F1L1L1 %1
%endmacro

IEMIMPL_MEDIA_F1H1H1 vpunpckhbw
IEMIMPL_MEDIA_F1H1H1 vpunpckhwd
IEMIMPL_MEDIA_F1H1H1 vpunpckhdq
IEMIMPL_MEDIA_F1H1H1 vpunpckhqdq
3896
3897
3898;
3899; Shufflers with evil 8-bit immediates.
3900;
3901
;
; pshufw mm, mm/m64, imm8: [A0] = pshufw([A1], A2).
; The immediate cannot be encoded at runtime, so we jump into a table of 256
; pre-assembled "pshufw mm0, mm1, N / ret" stubs, indexed by the immediate.
;
BEGINPROC_FASTCALL iemAImpl_pshufw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        movq    mm0, mm1                ; paranoia! (was "movq mm0, mm0", a no-op; preload the
                                        ; destination like the SSE variants do)
        lea     T0, [A2 + A2*4]         ; sizeof(pshufw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm8 * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        pshufw  mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_pshufw_u64
3927
3928
;
; pshufhw/pshuflw/pshufd xmm, xmm/m128, imm8: [A0] = %1([A1], A2).
; The immediate cannot be encoded at runtime, so we jump into a table of 256
; pre-assembled "%1 xmm0, xmm1, N / ret" stubs, indexed by the immediate.
;
%macro IEMIMPL_MEDIA_SSE_PSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        movdqu  xmm0, xmm1              ; paranoia! (pshufhw/pshuflw only modify part of the destination)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHUFXX pshufhw
IEMIMPL_MEDIA_SSE_PSHUFXX pshuflw
IEMIMPL_MEDIA_SSE_PSHUFXX pshufd
3960
3961
;
; vpshufhw/vpshuflw/vpshufd ymm, ymm/m256, imm8: [A0] = %1([A1], A2).
; Same 256-entry stub-table trick as the SSE variant above.
;
%macro IEMIMPL_MEDIA_AVX_VPSHUFXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; was IEMIMPL_SSE_PROLOGUE - this is AVX code, use the AVX pair
                                        ; (both are currently empty, so no behavioral change)

        vmovdqu ymm1, [A1]
        vmovdqu ymm0, ymm1              ; paranoia! (vpshufhw/vpshuflw only modify part of the destination)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]         ; sizeof(pshufXX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_SSE_EPILOGUE, see above
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm1, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufhw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshuflw
IEMIMPL_MEDIA_AVX_VPSHUFXX vpshufd
3993
3994
3995;
3996; Shifts with evil 8-bit immediates.
3997;
3998
;
; MMX shift-by-immediate: [A0] = %1([A0], A1).  The immediate cannot be
; encoded at runtime, so we jump into a table of 256 "%1 mm0, N / ret" stubs.
;
%macro IEMIMPL_MEDIA_MMX_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u64, 16
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]               ; in-place operand
        lea     T0, [A1 + A1*4]         ; sizeof(psXX+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]           ; T1 = &.imm0 + imm8 * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
%assign bImm 0
%rep 256
.imm %+ bImm:
        %1      mm0, bImm
        ret
 %assign bImm bImm + 1
%endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u64
%endmacro

IEMIMPL_MEDIA_MMX_PSHIFTXX psllw
IEMIMPL_MEDIA_MMX_PSHIFTXX pslld
IEMIMPL_MEDIA_MMX_PSHIFTXX psllq
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrld
IEMIMPL_MEDIA_MMX_PSHIFTXX psrlq
IEMIMPL_MEDIA_MMX_PSHIFTXX psraw
IEMIMPL_MEDIA_MMX_PSHIFTXX psrad
4034
4035
;
; SSE shift-by-immediate: [A0] = %1([A0], A1).  The immediate cannot be
; encoded at runtime, so we jump into a table of 256 "%1 xmm0, N / ret" stubs.
;
%macro IEMIMPL_MEDIA_SSE_PSHIFTXX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _imm_u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]              ; in-place operand
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A1 + A1*2]         ; sizeof(psXX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]         ; T1 = &.imm0 + imm8 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, bImm
        ret
  %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)          ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _imm_u128
%endmacro

IEMIMPL_MEDIA_SSE_PSHIFTXX psllw
IEMIMPL_MEDIA_SSE_PSHIFTXX pslld
IEMIMPL_MEDIA_SSE_PSHIFTXX psllq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrld
IEMIMPL_MEDIA_SSE_PSHIFTXX psrlq
IEMIMPL_MEDIA_SSE_PSHIFTXX psraw
IEMIMPL_MEDIA_SSE_PSHIFTXX psrad
IEMIMPL_MEDIA_SSE_PSHIFTXX pslldq
IEMIMPL_MEDIA_SSE_PSHIFTXX psrldq
4073
4074
4075;
4076; Move byte mask.
4077;
4078
; pmovmskb (MMX): *[A0] = byte sign mask of [A1].
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u64, 8
        PROLOGUE_2_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm1, [A1]
        pmovmskb T0, mm1
        mov     [A0], T0                ; T0 is 32-bit on x86 hosts, hence ...
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; ... explicitly zero the upper half of the 64-bit destination.
%endif
        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u64

; pmovmskb (SSE2): *[A0] = byte sign mask of [A1].
BEGINPROC_FASTCALL iemAImpl_pmovmskb_u128, 8
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A1]
        pmovmskb T0, xmm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; zero the upper half of the 64-bit destination on x86 hosts.
%endif
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_pmovmskb_u128

; vpmovmskb (AVX2, 256-bit): *[A0] = byte sign mask of [A1].
BEGINPROC_FASTCALL iemAImpl_vpmovmskb_u256, 8
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm1, [A1]
        vpmovmskb T0, ymm1
        mov     [A0], T0
%ifdef RT_ARCH_X86
        mov     dword [A0 + 4], 0       ; zero the upper half of the 64-bit destination on x86 hosts.
%endif
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_vpmovmskb_u256
4120
4121
4122;;
4123; Media instruction working on two full sized source registers and one destination (AVX).
4124;
4125; @param 1 The instruction
4126;
4127; @param A0 Pointer to the extended CPU/FPU state (X86XSAVEAREA).
4128; @param A1 Pointer to the destination media register size operand (output).
4129; @param A2 Pointer to the first source media register size operand (input).
4130; @param A3 Pointer to the second source media register size operand (input).
4131;
%macro IEMIMPL_MEDIA_F3 1
; 128-bit variant: [A1] = %1([A2], [A3]).  (A0, the XSAVE area, is currently unused here.)
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A2]              ; first source
        vmovdqu xmm1, [A3]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A1], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: [A1] = %1([A2], [A3]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A1], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_F3 vpshufb
IEMIMPL_MEDIA_F3 vpand
IEMIMPL_MEDIA_F3 vpminub
IEMIMPL_MEDIA_F3 vpminuw
IEMIMPL_MEDIA_F3 vpminud
IEMIMPL_MEDIA_F3 vpminsb
IEMIMPL_MEDIA_F3 vpminsw
IEMIMPL_MEDIA_F3 vpminsd
IEMIMPL_MEDIA_F3 vpmaxub
IEMIMPL_MEDIA_F3 vpmaxuw
IEMIMPL_MEDIA_F3 vpmaxud
IEMIMPL_MEDIA_F3 vpmaxsb
IEMIMPL_MEDIA_F3 vpmaxsw
IEMIMPL_MEDIA_F3 vpmaxsd
IEMIMPL_MEDIA_F3 vpandn
IEMIMPL_MEDIA_F3 vpor
IEMIMPL_MEDIA_F3 vpxor
IEMIMPL_MEDIA_F3 vpcmpeqb
IEMIMPL_MEDIA_F3 vpcmpeqw
IEMIMPL_MEDIA_F3 vpcmpeqd
IEMIMPL_MEDIA_F3 vpcmpeqq
IEMIMPL_MEDIA_F3 vpcmpgtb
IEMIMPL_MEDIA_F3 vpcmpgtw
IEMIMPL_MEDIA_F3 vpcmpgtd
IEMIMPL_MEDIA_F3 vpcmpgtq
IEMIMPL_MEDIA_F3 vpaddb
IEMIMPL_MEDIA_F3 vpaddw
IEMIMPL_MEDIA_F3 vpaddd
IEMIMPL_MEDIA_F3 vpaddq
IEMIMPL_MEDIA_F3 vpsubb
IEMIMPL_MEDIA_F3 vpsubw
IEMIMPL_MEDIA_F3 vpsubd
IEMIMPL_MEDIA_F3 vpsubq
4193
4194
4195;;
4196; Media instruction working on two full sized source registers and one destination (AVX),
4197; but no XSAVE state pointer argument.
4198;
4199; @param 1 The instruction
4200;
4201; @param A0 Pointer to the destination media register size operand (output).
4202; @param A1 Pointer to the first source media register size operand (input).
4203; @param A2 Pointer to the second source media register size operand (input).
4204;
%macro IEMIMPL_MEDIA_OPT_F3 1
; 128-bit variant: [A0] = %1([A1], [A2]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; first source
        vmovdqu xmm1, [A2]              ; second source
        %1      xmm0, xmm0, xmm1
        vmovdqu [A0], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; 256-bit variant: [A0] = %1([A1], [A2]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        %1      ymm0, ymm0, ymm1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_OPT_F3 vpacksswb
IEMIMPL_MEDIA_OPT_F3 vpackssdw
IEMIMPL_MEDIA_OPT_F3 vpackuswb
IEMIMPL_MEDIA_OPT_F3 vpackusdw
IEMIMPL_MEDIA_OPT_F3 vpmullw
IEMIMPL_MEDIA_OPT_F3 vpmulld
IEMIMPL_MEDIA_OPT_F3 vpmulhw
IEMIMPL_MEDIA_OPT_F3 vpmulhuw
IEMIMPL_MEDIA_OPT_F3 vpavgb
IEMIMPL_MEDIA_OPT_F3 vpavgw
IEMIMPL_MEDIA_OPT_F3 vpsignb
IEMIMPL_MEDIA_OPT_F3 vpsignw
IEMIMPL_MEDIA_OPT_F3 vpsignd
IEMIMPL_MEDIA_OPT_F3 vphaddw
IEMIMPL_MEDIA_OPT_F3 vphaddd
IEMIMPL_MEDIA_OPT_F3 vphsubw
IEMIMPL_MEDIA_OPT_F3 vphsubd
IEMIMPL_MEDIA_OPT_F3 vphaddsw
IEMIMPL_MEDIA_OPT_F3 vphsubsw
IEMIMPL_MEDIA_OPT_F3 vpmaddubsw
IEMIMPL_MEDIA_OPT_F3 vpmulhrsw
IEMIMPL_MEDIA_OPT_F3 vpsadbw
IEMIMPL_MEDIA_OPT_F3 vpmuldq
IEMIMPL_MEDIA_OPT_F3 vpmuludq
IEMIMPL_MEDIA_OPT_F3 vunpcklps
IEMIMPL_MEDIA_OPT_F3 vunpcklpd
IEMIMPL_MEDIA_OPT_F3 vunpckhps
IEMIMPL_MEDIA_OPT_F3 vunpckhpd
4261
4262;;
4263; Media instruction working on one full sized source registers and one destination (AVX),
4264; but no XSAVE state pointer argument.
4265;
4266; @param 1 The instruction
; @param 2 Flag whether the instruction has a 256-bit (AVX2) variant (1) or not (0).
4268;
4269; @param A0 Pointer to the destination media register size operand (output).
4270; @param A1 Pointer to the source media register size operand (input).
4271;
%macro IEMIMPL_MEDIA_OPT_F2_AVX 2
; 128-bit variant: [A0] = %1([A1]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]              ; source
        %1      xmm0, xmm0
        vmovdqu [A0], xmm0              ; destination

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
; 256-bit variant (only for instructions with an AVX2 form): [A0] = %1([A1]).
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %1      ymm0, ymm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_OPT_F2_AVX vpabsb, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsw, 1
IEMIMPL_MEDIA_OPT_F2_AVX vpabsd, 1
IEMIMPL_MEDIA_OPT_F2_AVX vphminposuw, 0
4304
4305
4306;
4307; The SSE 4.2 crc32
4308;
; @param A0    Pointer to the 32-bit destination.
; @param A1    The source operand, sized according to the suffix.
4311;
; crc32 r32, r/m8: *[A0] = crc32(*[A0], A1_8).
BEGINPROC_FASTCALL iemAImpl_crc32_u8, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; load the accumulated CRC
        crc32   T0_32, A1_8
        mov     [A0], T0_32             ; store the updated CRC

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u8

; crc32 r32, r/m16: *[A0] = crc32(*[A0], A1_16).
BEGINPROC_FASTCALL iemAImpl_crc32_u16, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_16
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u16

; crc32 r32, r/m32: *[A0] = crc32(*[A0], A1_32).
BEGINPROC_FASTCALL iemAImpl_crc32_u32, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]
        crc32   T0_32, A1_32
        mov     [A0], T0_32

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u32

%ifdef RT_ARCH_AMD64
; crc32 r64, r/m64: *[A0] = crc32(*[A0], A1).  64-bit hosts only.
BEGINPROC_FASTCALL iemAImpl_crc32_u64, 8
        PROLOGUE_2_ARGS

        mov     T0_32, [A0]             ; 32-bit load zero-extends into T0
        crc32   T0, A1
        mov     [A0], T0_32             ; result is always 32-bit

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_crc32_u64
%endif
4353
4354
4355;
4356; PTEST (SSE 4.1)
4357;
4358; @param A0 Pointer to the first source operand (aka readonly destination).
4359; @param A1 Pointer to the second source operand.
4360; @param A2 Pointer to the EFLAGS register.
4361;
; ptest xmm, xmm/m128: sets EFLAGS (ZF/CF) from [A0] vs [A1], stored via *[A2].
BEGINPROC_FASTCALL iemAImpl_ptest_u128, 12
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        ptest   xmm0, xmm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ptest_u128
4374
; vptest ymm, ymm/m256: sets EFLAGS (ZF/CF) from [A0] vs [A1], stored via *[A2].
BEGINPROC_FASTCALL iemAImpl_vptest_u256, 12
        PROLOGUE_3_ARGS
        IEMIMPL_AVX_PROLOGUE            ; was IEMIMPL_SSE_PROLOGUE - this is AVX code, use the AVX pair
                                        ; (both are currently empty, so no behavioral change)

        vmovdqu ymm0, [A0]
        vmovdqu ymm1, [A1]
        vptest  ymm0, ymm1
        IEM_SAVE_FLAGS A2, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_SSE_EPILOGUE, see above
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_vptest_u256
4387
4388
4389;;
4390; Template for the [v]pmov{s,z}x* instructions
4391;
4392; @param 1 The instruction
4393;
4394; @param A0 Pointer to the destination media register size operand (output).
4395; @param A1 The source operand value (input).
4396;
%macro IEMIMPL_V_PMOV_SZ_X 1
; SSE4.1 variant: [A0] = %1(A1).  A1 holds the source value in a GPR.
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movd    xmm0, A1
        %1      xmm0, xmm0
        movdqu  [A0], xmm0              ; was vmovdqu: VEX encoding would #UD on SSE4.1-only hosts

        IEMIMPL_SSE_EPILOGUE            ; was IEMIMPL_SSE_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

; AVX 128-bit variant: [A0] = v%1(A1).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movd    xmm0, A1
        v %+ %1 xmm0, xmm0
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

; AVX2 256-bit variant: [A0] = v%1([A1]).  Note: here A1 is a pointer, not a value.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        v %+ %1 ymm0, xmm0
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE            ; was IEMIMPL_AVX_PROLOGUE (copy&paste) - must pair with the prologue above
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_V_PMOV_SZ_X pmovsxbw
IEMIMPL_V_PMOV_SZ_X pmovsxbd
IEMIMPL_V_PMOV_SZ_X pmovsxbq
IEMIMPL_V_PMOV_SZ_X pmovsxwd
IEMIMPL_V_PMOV_SZ_X pmovsxwq
IEMIMPL_V_PMOV_SZ_X pmovsxdq

IEMIMPL_V_PMOV_SZ_X pmovzxbw
IEMIMPL_V_PMOV_SZ_X pmovzxbd
IEMIMPL_V_PMOV_SZ_X pmovzxbq
IEMIMPL_V_PMOV_SZ_X pmovzxwd
IEMIMPL_V_PMOV_SZ_X pmovzxwq
IEMIMPL_V_PMOV_SZ_X pmovzxdq
4448
4449
4450;;
4451; Need to move this as well somewhere better?
4452;
struc IEMSSERESULT
        .uResult       resd 4           ; The 128-bit result value.
        .MXCSR         resd 1           ; The MXCSR value after the operation.
endstruc


;;
; Need to move this as well somewhere better?
;
struc IEMAVX128RESULT
        .uResult       resd 4           ; The 128-bit result value.
        .MXCSR         resd 1           ; The MXCSR value after the operation.
endstruc


;;
; Need to move this as well somewhere better?
;
struc IEMAVX256RESULT
        .uResult       resd 8           ; The 256-bit result value.
        .MXCSR         resd 1           ; The MXCSR value after the operation.
endstruc
4475
4476
4477;;
4478; Initialize the SSE MXCSR register using the guest value partially to
4479; account for rounding mode.
4480;
4481; @uses 4 bytes of stack to save the original value, T0.
4482; @param 1 Expression giving the address of the FXSTATE of the guest.
4483;
%macro SSE_LD_FXSTATE_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack; SSE_ST_FXSTATE_MXCSR
                                        ; restores it and pops these 4 bytes again.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ, rounding and DAZ from the guest
        or      T0_32, X86_MXCSR_XCPT_MASK ; mask all exceptions so the host never traps
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; ldmxcsr only takes a memory operand, hence the stack dance
        add     xSP, 4
%endmacro
4496
4497
4498;;
4499; Restores the SSE MXCSR register with the original value.
4500;
4501; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
4502; @param 1 Expression giving the address where to return the MXCSR value.
4503; @param 2 Expression giving the address of the FXSTATE of the guest.
4504;
4505; @note Restores the stack pointer.
4506;
%macro SSE_ST_FXSTATE_MXCSR 2
        sub     xSP, 4
        stmxcsr [xSP]                   ; read the post-operation MXCSR via the stack
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS ; keep only the newly raised exception flags
        or      T0_32, T1_32
        mov     [%1 + IEMSSERESULT.MXCSR], T0_32

        ldmxcsr [xSP]                   ; restore the host MXCSR pushed by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4
%endmacro
4521
4522
4523;;
4524; Initialize the SSE MXCSR register using the guest value partially to
4525; account for rounding mode.
4526;
4527; @uses 4 bytes of stack to save the original value.
4528; @param 1 Expression giving the address of the FXSTATE of the guest.
4529;
%macro AVX_LD_XSAVEAREA_MXCSR 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR on the stack; the AVX*_ST_XSAVEAREA_MXCSR
                                        ; macros restore it and pop these 4 bytes again.
        mov     T0_32, [%1 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; take only FZ, rounding and DAZ from the guest
        ; NOTE(review): unlike SSE_LD_FXSTATE_MXCSR this does not OR in X86_MXCSR_XCPT_MASK;
        ; the and-mask above leaves all exception mask bits set to 0, i.e. unmasked - confirm intentional.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; ldmxcsr only takes a memory operand, hence the stack dance
        add     xSP, 4
%endmacro
4541
4542
4543;;
4544; Restores the AVX128 MXCSR register with the original value.
4545;
4546; @param 1 Expression giving the address where to return the MXCSR value.
4547;
4548; @note Restores the stack pointer.
4549;
%macro AVX128_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX128RESULT.MXCSR] ; return the post-operation MXCSR as-is

        ldmxcsr [xSP]                   ; restore the host MXCSR pushed by AVX_LD_XSAVEAREA_MXCSR ...
        add     xSP, 4                  ; ... and drop its 4-byte stack slot.
%endmacro
4556
4557
4558;;
4559; Restores the AVX256 MXCSR register with the original value.
4560;
4561; @param 1 Expression giving the address where to return the MXCSR value.
4562;
4563; @note Restores the stack pointer.
4564;
%macro AVX256_ST_XSAVEAREA_MXCSR 1
        stmxcsr [%1 + IEMAVX256RESULT.MXCSR] ; return the post-operation MXCSR as-is

        ldmxcsr [xSP]                   ; restore the host MXCSR pushed by AVX_LD_XSAVEAREA_MXCSR ...
        add     xSP, 4                  ; ... and drop its 4-byte stack slot.
%endmacro
4571
4572
4573;;
4574; Floating point instruction working on two full sized registers.
4575;
4576; @param 1 The instruction
4577; @param 2 Flag whether the AVX variant of the instruction takes two or three operands, 0 to disable AVX variants
4578;
4579; @param A0 FPU context (FXSTATE or XSAVEAREA).
4580; @param A1 Where to return the result including the MXCSR value.
4581; @param A2 Pointer to the first media register size operand (input/output).
4582; @param A3 Pointer to the second media register size operand (input).
4583;
%macro IEMIMPL_FP_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE        ; Fix: was IEMIMPL_SSE_PROLOGUE - must unwind the prologue (cf. IEMIMPL_FP_F2_R32/R64).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 3
 ; Three-operand AVX form: vxxx xmm0, xmm0(src1), xmm1(src2).
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %elif %2 == 2
 ; Two-operand AVX form: vxxx xmm0, xmm1(src) - used for the unary conversions below.
BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 12
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
        v %+ %1 ymm0, ymm1
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
 %endif
%endmacro
4662
; Binary packed FP operations: three-operand AVX variants (vxxx dst, src1, src2).
IEMIMPL_FP_F2 addps, 3
IEMIMPL_FP_F2 addpd, 3
IEMIMPL_FP_F2 mulps, 3
IEMIMPL_FP_F2 mulpd, 3
IEMIMPL_FP_F2 subps, 3
IEMIMPL_FP_F2 subpd, 3
IEMIMPL_FP_F2 minps, 3
IEMIMPL_FP_F2 minpd, 3
IEMIMPL_FP_F2 divps, 3
IEMIMPL_FP_F2 divpd, 3
IEMIMPL_FP_F2 maxps, 3
IEMIMPL_FP_F2 maxpd, 3
IEMIMPL_FP_F2 haddps, 3
IEMIMPL_FP_F2 haddpd, 3
IEMIMPL_FP_F2 hsubps, 3
IEMIMPL_FP_F2 hsubpd, 3
IEMIMPL_FP_F2 addsubps, 3
IEMIMPL_FP_F2 addsubpd, 3


;;
; These are actually unary operations but to keep it simple
; we treat them as binary for now, so the output result is
; always in sync with the register where the result might get written
; to.  Their AVX variants take two operands (vxxx dst, src).
IEMIMPL_FP_F2 sqrtps, 2
IEMIMPL_FP_F2 rsqrtps, 2
IEMIMPL_FP_F2 sqrtpd, 2
IEMIMPL_FP_F2 cvtdq2ps, 2
IEMIMPL_FP_F2 cvtps2dq, 2
IEMIMPL_FP_F2 cvttps2dq, 2
IEMIMPL_FP_F2 cvttpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtdq2pd, 0 ; @todo AVX variants due to register size differences missing right now
IEMIMPL_FP_F2 cvtpd2dq, 0 ; @todo AVX variants due to register size differences missing right now
4697
4698
;;
; Floating point instruction working on a full sized register and a single precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second single precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R32 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movd    xmm1, [A3]          ; only 32 bits of source are loaded
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r32

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovd   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE - unwind the prologue (matches IEMIMPL_FP_F2_R64).
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r32
%endmacro
4740
; Scalar single-precision operations (xmm, r32 source).
IEMIMPL_FP_F2_R32 addss
IEMIMPL_FP_F2_R32 mulss
IEMIMPL_FP_F2_R32 subss
IEMIMPL_FP_F2_R32 minss
IEMIMPL_FP_F2_R32 divss
IEMIMPL_FP_F2_R32 maxss
IEMIMPL_FP_F2_R32 cvtss2sd
IEMIMPL_FP_F2_R32 sqrtss
IEMIMPL_FP_F2_R32 rsqrtss
4750
4751
;;
; Floating point instruction working on a full sized register and a double precision operand.
;
; @param 1 The instruction
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second double precision floating point value (input).
;
%macro IEMIMPL_FP_F2_R64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0     ; activate the guest MXCSR (host value saved on stack)

        movdqu  xmm0, [A2]
        movq    xmm1, [A3]          ; only 64 bits of source are loaded
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0 ; store result MXCSR and restore the host value
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128_r64

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovq   xmm1, [A3]
        v %+ %1 xmm0, xmm0, xmm1    ; three-operand AVX form: dst, src1, src2
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128_r64
%endmacro

; Scalar double-precision operations (xmm, r64 source).
IEMIMPL_FP_F2_R64 addsd
IEMIMPL_FP_F2_R64 mulsd
IEMIMPL_FP_F2_R64 subsd
IEMIMPL_FP_F2_R64 minsd
IEMIMPL_FP_F2_R64 divsd
IEMIMPL_FP_F2_R64 maxsd
IEMIMPL_FP_F2_R64 cvtsd2ss
IEMIMPL_FP_F2_R64 sqrtsd
4802
4803
;;
; Macro for the cvtpd2ps/cvtps2pd instructions.
;
; @param 1 The instruction name.
; @param 2 Whether the AVX256 result is 128-bit (0) or 256-bit (1).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the result including the MXCSR value.
; @param A2 Pointer to the first media register size operand (input/output).
; @param A3 Pointer to the second media register size operand (input).
;
%macro IEMIMPL_CVT_F2 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        %1      xmm0, xmm1
        movdqu  [A1 + IEMSSERESULT.uResult], xmm0

        SSE_ST_FXSTATE_MXCSR A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu xmm0, [A2]
        vmovdqu xmm1, [A3]
        v %+ %1 xmm0, xmm1
        vmovdqu [A1 + IEMAVX128RESULT.uResult], xmm0

        AVX128_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_v %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE
        AVX_LD_XSAVEAREA_MXCSR A0

        vmovdqu ymm0, [A2]
        vmovdqu ymm1, [A3]
 %if %2 == 0
        v %+ %1 xmm0, ymm1          ; narrowing conversion: 256-bit source, 128-bit result (vcvtpd2ps)
 %else
        v %+ %1 ymm0, xmm1          ; widening conversion: 128-bit source, 256-bit result (vcvtps2pd)
 %endif
        vmovdqu [A1 + IEMAVX256RESULT.uResult], ymm0

        AVX256_ST_XSAVEAREA_MXCSR A1
        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_v %+ %1 %+ _u256
%endmacro

IEMIMPL_CVT_F2 cvtpd2ps, 0
IEMIMPL_CVT_F2 cvtps2pd, 1
4868
4869
;;
; shufps instructions with 8-bit immediates.
;
; Since the immediate must be encoded in the instruction, a 256-entry jump
; table is generated, one stub per immediate value, and dispatched via call.
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufps_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(shufpX+ret+int3) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6 = stub for this immediate
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufps  xmm0, xmm1, bImm    ; 4 bytes; int3 pads each entry to exactly 6 bytes
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufps_u128
4903
4904
;;
; shufpd instruction with 8-bit immediates.
;
; Immediate dispatched via a 256-entry stub table, 6 bytes per entry
; (the 66h prefix makes shufpd one byte longer than shufps, so no int3 pad).
;
; @param A0 Pointer to the destination media register size operand (input/output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_shufpd_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(shufpX+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        shufpd  xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_shufpd_u128
4937
4938
;;
; vshufp{s,d} instructions with 8-bit immediates.
;
; Immediate dispatched via a 256-entry stub table, 6 bytes per entry
; (VEX-encoded vshufpX is 5 bytes plus the 1-byte ret).
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_VSHUFPX 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]     ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A3 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]     ; sizeof(vshufpX+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro

IEMIMPL_MEDIA_AVX_VSHUFPX vshufps
IEMIMPL_MEDIA_AVX_VSHUFPX vshufpd
5005
5006
;;
; One of the [p]blendv{b,ps,pd} variants
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (input/output).
; @param A1 Pointer to the second media sized value (input).
; @param A2 Pointer to the media register sized mask value (input).
;
%macro IEMIMPL_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2] ; This is implicit
        movdqu  xmm1, [A0]
        movdqu  xmm2, [A1] ; @todo Do I need to save the original value here first?
        %1      xmm1, xmm2
        movdqu  [A0], xmm1

        IEMIMPL_SSE_EPILOGUE        ; Fix: was IEMIMPL_SSE_PROLOGUE - must unwind the prologue here.
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro
5031
; SSE4.1 blend variants with implicit XMM0 mask.
IEMIMPL_P_BLEND pblendvb
IEMIMPL_P_BLEND blendvps
IEMIMPL_P_BLEND blendvpd
5035
5036
;;
; One of the v[p]blendv{b,ps,pd} variants
;
; @param 1 The instruction
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 Pointer to the first media register sized operand (input).
; @param A2 Pointer to the second media register sized operand (input).
; @param A3 Pointer to the media register sized mask value (input).
%macro IEMIMPL_AVX_P_BLEND 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu xmm0, [A1]
        vmovdqu xmm1, [A2]
        vmovdqu xmm2, [A3]          ; explicit mask operand (4th operand of the AVX form)
        %1      xmm0, xmm0, xmm1, xmm2
        vmovdqu [A0], xmm0

        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE - must unwind the prologue here.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        vmovdqu ymm2, [A3]
        %1      ymm0, ymm0, ymm1, ymm2
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE        ; Fix: was IEMIMPL_AVX_PROLOGUE.
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u256
%endmacro
5075
; AVX blend variants with explicit mask register operand.
IEMIMPL_AVX_P_BLEND vpblendvb
IEMIMPL_AVX_P_BLEND vblendvps
IEMIMPL_AVX_P_BLEND vblendvpd
5079
5080
;;
; palignr mm1, mm2/m64 instruction.
;
; Immediate dispatched via a 256-entry stub table, 6 bytes per entry.
;
; @param A0 Pointer to the first media register sized operand (output).
; @param A1 The second register sized operand (input).
; @param A2 The 8-bit immediate.
BEGINPROC_FASTCALL iemAImpl_palignr_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_MMX_PROLOGUE

        movq    mm0, [A0]
        movq    mm1, A1             ; second operand is passed by value, not pointer
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(palignr+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6
        call    T1
        movq    [A0], mm0

        IEMIMPL_MMX_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        palignr mm0, mm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_palignr_u64
5112
5113
;;
; SSE instructions with 8-bit immediates of the form
;    xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Immediate dispatched via a 256-entry stub table; int3 pads each entry
; (insn + ret + int3) to exactly 8 bytes.
;
; @param 1 The instruction name.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 Pointer to the second source media register size operand (input).
; @param A2 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*3]     ; sizeof(insnX+ret) == 8: (A2 * 4) * 2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 8
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendps
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 blendpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pblendw
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 palignr
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 pclmulqdq
IEMIMPL_MEDIA_SSE_INSN_IMM8_6 aeskeygenassist
5160
5161
;;
; AVX instructions with 8-bit immediates of the form
;    xxx {x,y}mm1, {x,y}mm2, {x,y}mm3, imm8.
; where the instruction encoding takes up 6 bytes.
;
; Immediate dispatched via a 256-entry stub table; int3 pads each entry
; (insn + ret + int3) to exactly 8 bytes.
;
; @param 1 The instruction name.
; @param 2 Whether the instruction has a 256-bit variant (1) or not (0).
;
; @param A0 Pointer to the destination media register size operand (output).
; @param A1 Pointer to the first source media register size operand (input).
; @param A2 Pointer to the second source media register size operand (input).
; @param A3 The 8-bit immediate
;
%macro IEMIMPL_MEDIA_AVX_INSN_IMM8_6 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        movdqu  xmm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A3 * 8
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128

 %if %2 == 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u256, 16
        PROLOGUE_4_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        vmovdqu ymm1, [A2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1
        vmovdqu [A0], ymm0

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      ymm0, ymm0, ymm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u256
 %endif
%endmacro

IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendps, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vblendpd, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpblendw, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpalignr, 1
IEMIMPL_MEDIA_AVX_INSN_IMM8_6 vpclmulqdq, 0
5238
5239
;;
; Source operand layouts for the pcmpistr*/pcmpestr* helpers below.
; @todo Consider moving these to a shared header.
;
struc IEMPCMPISTRXSRC
    .uSrc1      resd 4              ; first 128-bit string operand
    .uSrc2      resd 4              ; second 128-bit string operand
endstruc

struc IEMPCMPESTRXSRC
    .uSrc1      resd 4              ; first 128-bit string operand
    .uSrc2      resd 4              ; second 128-bit string operand
    .u64Rax     resd 2              ; explicit length of the first operand (RAX)
    .u64Rdx     resd 2              ; explicit length of the second operand (RDX)
endstruc
5254
;;
; The pcmpistri instruction.
;
; Immediate dispatched via a 256-entry stub table, 8 bytes per entry.
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc2]
        mov     T2, A0              ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A3 * 8
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx           ; pcmpistri returns its index in ECX

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistri xmm0, xmm1, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistri_u128
5292
;;
; The pcmpestri instruction.
;
; Immediate dispatched via a 256-entry stub table, 8 bytes per entry
; (REX.W prefix + pcmpestri + ret).
;
; @param A0 Pointer to the ECX register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestri_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc2]
        mov     T2, A0              ; A0 can be ecx/rcx in some calling conventions which gets overwritten later (T2 only available on AMD64)
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                 ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX                 ; restore caller's xDX before touching A1/A2 again
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        mov     [T2], ecx           ; pcmpestri returns its index in ECX

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        db      0x48                ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestri xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestri_u128
5334
;;
; The pcmpistrm instruction template.
;
; Sources go in xmm1/xmm2 because the instruction writes its mask to XMM0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpistrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPISTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPISTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        call    T1

        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0          ; implicit XMM0 result of pcmpistrm

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pcmpistrm xmm1, xmm2, bImm
        ret
        int3
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpistrm_u128
5371
;;
; The pcmpestrm instruction template.
;
; Sources go in xmm1/xmm2 because the instruction writes its mask to XMM0.
;
; @param A0 Pointer to the XMM0 register to store the result to (output).
; @param A1 Pointer to the EFLAGS register.
; @param A2 Pointer to the structure containing the source operands (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pcmpestrm_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm1, [A2 + IEMPCMPESTRXSRC.uSrc1]
        movdqu  xmm2, [A2 + IEMPCMPESTRXSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*3]     ; sizeof(insnX+ret) == 8: (A3 * 4) * 2
        lea     T1, [T1 + T0*2]
        push    xDX                 ; xDX can be A1 or A2 depending on the calling convention
        mov     xAX, [A2 + IEMPCMPESTRXSRC.u64Rax] ; T0 is rax, so only overwrite it after we're done using it
        mov     xDX, [A2 + IEMPCMPESTRXSRC.u64Rdx]
        call    T1

        pop     xDX                 ; restore caller's xDX before touching A1 again
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0
        movdqu  [A0], xmm0          ; implicit XMM0 result of pcmpestrm

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        db      0x48                ; Use the REX.W prefix to make pcmpestr{i,m} use full RAX/RDX (would use EAX/EDX only otherwise.)
        pcmpestrm xmm1, xmm2, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*8 == 0x800
dw 0xf7ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x107ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pcmpestrm_u128
5412
5413
;;
; pinsrw instruction.
;
; MMX form; immediate dispatched via a 256-entry stub table, 5 bytes per entry.
;
; @param A0 Pointer to the first media register size operand (input/output).
; @param A1 The 16 bit input operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pinsrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, [A0]
        lea     T0, [A2 + A2*4]     ; sizeof(pinsrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]       ; T1 = .imm0 + A2 * 5
        call    T1
        movq    [A0], mm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  mm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u64
5445
;;
; pinsrw, SSE form; immediate dispatched via a 256-entry stub table, 6 bytes per entry.
; Parameters as iemAImpl_pinsrw_u64 above, but A0 points to a 128-bit operand.
BEGINPROC_FASTCALL iemAImpl_pinsrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(pinsrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pinsrw  xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pinsrw_u128
5470
;;
; vpinsrw instruction.
;
; Immediate dispatched via a 256-entry stub table, 6 bytes per entry.
;
; @param A0 Pointer to the first media register size operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 16 bit input operand (input).
; @param A3 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpinsrw_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A3 + A3*2]     ; sizeof(vpinsrw+ret) == 6: (A3 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A3 * 6
        mov     A1, A2              ; A2 requires longer encoding on Windows; stubs use A1_32 below
        call    T1
        movdqu  [A0], xmm0

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpinsrw xmm0, xmm0, A1_32, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpinsrw_u128
5504
5505
;;
; pextrw instruction.
;
; MMX form; immediate dispatched via a 256-entry stub table, 5 bytes per entry.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_pextrw_u64, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movq    mm0, A1             ; source operand is passed by value
        lea     T0, [A2 + A2*4]     ; sizeof(pextrw+ret) == 5
        lea     T1, [.imm0 xWrtRIP]
        lea     T1, [T1 + T0]       ; T1 = .imm0 + A2 * 5
        call    T1
        mov     word [A0], T0_16    ; stubs leave the extracted word in T0_32

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, mm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*5 == 0x500
dw 0xfaff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u64
5537
;;
; pextrw, SSE form; immediate dispatched via a 256-entry stub table, 6 bytes per entry.
; Parameters as iemAImpl_pextrw_u64 above, but A1 points to a 128-bit operand.
BEGINPROC_FASTCALL iemAImpl_pextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(pextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6
        call    T1
        mov     word [A0], T0_16    ; stubs leave the extracted word in T0_32

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        pextrw  T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_pextrw_u128
5562
;;
; vpextrw instruction.
;
; Immediate dispatched via a 256-entry stub table, 6 bytes per entry.
;
; @param A0 Pointer to the 16bit output operand (output).
; @param A1 Pointer to the source media register size operand (input).
; @param A2 The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_vpextrw_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        lea     T1, [.imm0 xWrtRIP]
        lea     T0, [A2 + A2*2]     ; sizeof(vpextrw+ret) == 6: (A2 * 3) *2
        lea     T1, [T1 + T0*2]     ; T1 = .imm0 + A2 * 6
        call    T1
        mov     word [A0], T0_16    ; stubs leave the extracted word in T0_32

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        vpextrw T0_32, xmm0, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                            ; 256*6 == 0x600
dw 0xf9ff  + (.immEnd - .imm0)      ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)      ; will cause warning if entries are too small.
ENDPROC iemAImpl_vpextrw_u128
5594
5595
;;
; movmskp{s,d} SSE instruction template
;
; Emits the SSE form (%1), the AVX 128-bit form and the AVX 256-bit form (%2).
;
; @param 1 The SSE instruction name.
; @param 2 The AVX instruction name.
;
; @param A0 Pointer to the output register (output/byte sized).
; @param A1 Pointer to the source media register size operand (input).
;
%macro IEMIMPL_MEDIA_MOVMSK_P 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A1]
        %1      T0, xmm0
        mov     byte [A0], T0_8     ; only the low byte of the mask is returned

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u128, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        movdqu  xmm0, [A1]
        %2      T0, xmm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u128

BEGINPROC_FASTCALL iemAImpl_ %+ %2 %+ _u256, 16
        PROLOGUE_2_ARGS
        IEMIMPL_AVX_PROLOGUE

        vmovdqu ymm0, [A1]
        %2      T0, ymm0
        mov     byte [A0], T0_8

        IEMIMPL_AVX_EPILOGUE
        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %2 %+ _u256
%endmacro

IEMIMPL_MEDIA_MOVMSK_P movmskps, vmovmskps
IEMIMPL_MEDIA_MOVMSK_P movmskpd, vmovmskpd
5645
5646
;;
; Restores the SSE MXCSR register with the original value.
;
; Pairs with SSE_LD_FXSTATE_MXCSR, which leaves the host MXCSR value pushed
; on the stack; the final ldmxcsr/add below pops and restores it.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
; @param 2 Expression giving the address of the FXSTATE of the guest.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY 2
        sub     xSP, 4
        stmxcsr [xSP]               ; read the MXCSR as modified by the guest instruction
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%2 + X86FXSTATE.MXCSR]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]               ; pop + restore the host MXCSR saved by SSE_LD_FXSTATE_MXCSR
        add     xSP, 4
%endmacro
5670
5671
;;
; cvttsd2si instruction - 32-bit variant (truncating double -> i32).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0_32, [A3]       ; convert with truncation under the guest MXCSR
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i32_r64
5692
;;
; cvttsd2si instruction - 64-bit variant (truncating double -> i64).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvttsd2si T0, [A3]          ; 64-bit destination register form
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttsd2si_i64_r64
5713
5714
;;
; cvtsd2si instruction - 32-bit variant (rounding double -> i32 per MXCSR.RC).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i32_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0_32, [A3]
        mov     dword [A2], T0_32

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i32_r64
5735
;;
; cvtsd2si instruction - 64-bit variant (rounding double -> i64 per MXCSR.RC).
;
; @param A0 FPU context (FXSTATE or XSAVEAREA).
; @param A1 Where to return the MXCSR value.
; @param A2 Pointer to the result operand (output).
; @param A3 Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsd2si_i64_r64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0

        cvtsd2si T0, [A3]
        mov     qword [A2], T0

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsd2si_i64_r64
5756
5757
;;
; cvttss2si instruction - 32-bit variant (float -> i32, truncating conversion).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR from the FPU context

        cvttss2si T0_32, [A3]           ; truncating convert; may raise MXCSR exception flags
        mov     dword [A2], T0_32       ; store the 32-bit integer result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i32_r32

;;
; cvttss2si instruction - 64-bit variant (float -> i64, truncating conversion).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvttss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR from the FPU context

        cvttss2si T0, [A3]              ; truncating convert to a signed 64-bit integer
        mov     qword [A2], T0          ; store the 64-bit integer result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvttss2si_i64_r32
5799
5800
;;
; cvtss2si instruction - 32-bit variant (float -> i32, rounds per MXCSR.RC).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i32_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtss2si T0_32, [A3]            ; convert using the current (guest) rounding mode
        mov     dword [A2], T0_32       ; store the 32-bit integer result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i32_r32

;;
; cvtss2si instruction - 64-bit variant (float -> i64, rounds per MXCSR.RC).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtss2si_i64_r32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtss2si T0, [A3]               ; convert using the current (guest) rounding mode
        mov     qword [A2], T0          ; store the 64-bit integer result

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtss2si_i64_r32
5842
5843
;;
; cvtsi2ss instruction - 32-bit variant (i32 -> float).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtsi2ss xmm0, dword [A3]       ; convert the 32-bit integer to single precision
        movd    dword [A2], xmm0        ; store only the low 32 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i32

;;
; cvtsi2ss instruction - 64-bit variant (i64 -> float).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2ss_r32_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtsi2ss xmm0, qword [A3]       ; convert the 64-bit integer to single precision
        movd    dword [A2], xmm0        ; store only the low 32 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2ss_r32_i64
5885
5886
;;
; cvtsi2sd instruction - 32-bit variant (i32 -> double).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i32, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtsi2sd xmm0, dword [A3]       ; convert the 32-bit integer to double precision
        movq    [A2], xmm0              ; store the low 64 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i32

;;
; cvtsi2sd instruction - 64-bit variant (i64 -> double).
;
; @param    A0      FPU context (FXSTATE or XSAVEAREA).
; @param    A1      Where to return the MXCSR value.
; @param    A2      Pointer to the result operand (output).
; @param    A3      Pointer to the second operand (input).
;
BEGINPROC_FASTCALL iemAImpl_cvtsi2sd_r64_i64, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR A0         ; load guest MXCSR so the guest rounding mode is in effect

        cvtsi2sd xmm0, qword [A3]       ; convert the 64-bit integer to double precision
        movq    [A2], xmm0              ; store the low 64 bits (the scalar result)

        SSE_ST_FXSTATE_MXCSR_ONLY A1, A0 ; return the updated MXCSR in [A1], restore the host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_cvtsi2sd_r64_i64
5928
5929
;;
; Initialize the SSE MXCSR register using the guest value partially to
; account for rounding mode (and FZ/DAZ), with all exceptions masked so the
; host never takes a #XF fault on behalf of the guest.
;
; IMPORTANT: This macro deliberately leaves the saved host MXCSR value in a
; 4-byte stack slot when it completes; the companion macro
; SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE consumes that slot to restore the
; host MXCSR and rebalance the stack.  The two must always be used as a pair.
;
; @uses 4 bytes of stack to save the original value (still allocated on exit), T0.
; @param 1 Expression giving the address of the MXCSR register of the guest.
;
%macro SSE_LD_FXSTATE_MXCSR_ONLY 1
        sub     xSP, 4

        stmxcsr [xSP]                   ; save the host MXCSR; left on the stack for the ST macro
        mov     T0_32, [%1]             ; fetch the guest MXCSR, ...
        and     T0_32, X86_MXCSR_FZ | X86_MXCSR_RC_MASK | X86_MXCSR_DAZ ; ... keep only FZ, rounding control and DAZ, ...
        or      T0_32, X86_MXCSR_XCPT_MASK ; ... and mask all exceptions.
        sub     xSP, 4
        mov     [xSP], T0_32
        ldmxcsr [xSP]                   ; activate the merged guest value
        add     xSP, 4
%endmacro
5949
5950
;;
; Restores the SSE MXCSR register with the original (host) value and merges
; the exception status flags raised by the emulated instruction into the
; guest MXCSR value at the given address.
;
; Counterpart of SSE_LD_FXSTATE_MXCSR_ONLY: the final ldmxcsr/add consume the
; 4-byte stack slot that macro left behind (holding the host MXCSR), thereby
; restoring the stack pointer.
;
; @uses 4 bytes of stack to save the content of MXCSR value, T0, T1.
; @param 1 Expression giving the address where to return the MXCSR value - only the MXCSR is stored, no IEMSSERESULT is used.
;
; @note Restores the stack pointer.
;
%macro SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE 1
        sub     xSP, 4
        stmxcsr [xSP]                   ; read the current MXCSR (carries new exception flags)
        mov     T0_32, [xSP]
        add     xSP, 4
        ; Merge the status bits into the original MXCSR value.
        mov     T1_32, [%1]
        and     T0_32, X86_MXCSR_XCPT_FLAGS
        or      T0_32, T1_32
        mov     [%1], T0_32

        ldmxcsr [xSP]                   ; reload the host MXCSR saved by SSE_LD_FXSTATE_MXCSR_ONLY ...
        add     xSP, 4                  ; ... and release its stack slot, rebalancing xSP.
%endmacro
5973
5974
;
; UCOMISS (SSE) - unordered scalar single-precision compare, result in EFLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A2]              ; full 128-bit loads; only the low scalar is compared
        movdqu  xmm1, [A3]
        ucomiss xmm0, xmm1              ; sets ZF/PF/CF per the comparison outcome
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomiss_u128

;; VEX-encoded (AVX) variant - same comparison semantics as ucomiss above.
BEGINPROC_FASTCALL iemAImpl_vucomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomiss_u128
6012
6013
;
; UCOMISD (SSE) - unordered scalar double-precision compare, result in EFLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_ucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A2]              ; full 128-bit loads; only the low scalar is compared
        movdqu  xmm1, [A3]
        ucomisd xmm0, xmm1              ; sets ZF/PF/CF per the comparison outcome
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ucomisd_u128

;; VEX-encoded (AVX) variant - same comparison semantics as ucomisd above.
BEGINPROC_FASTCALL iemAImpl_vucomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vucomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vucomisd_u128
6051
;
; COMISS (SSE) - ordered scalar single-precision compare, result in EFLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A2]              ; full 128-bit loads; only the low scalar is compared
        movdqu  xmm1, [A3]
        comiss  xmm0, xmm1              ; sets ZF/PF/CF per the comparison outcome
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comiss_u128

;; VEX-encoded (AVX) variant - same comparison semantics as comiss above.
BEGINPROC_FASTCALL iemAImpl_vcomiss_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomiss xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomiss_u128
6089
6090
;
; COMISD (SSE) - ordered scalar double-precision compare, result in EFLAGS.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      Pointer to the first source operand (aka readonly destination).
; @param    A3      Pointer to the second source operand.
;
BEGINPROC_FASTCALL iemAImpl_comisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A2]              ; full 128-bit loads; only the low scalar is compared
        movdqu  xmm1, [A3]
        comisd  xmm0, xmm1              ; sets ZF/PF/CF per the comparison outcome
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; copy the status flags into the guest EFLAGS

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_comisd_u128

;; VEX-encoded (AVX) variant - same comparison semantics as comisd above.
BEGINPROC_FASTCALL iemAImpl_vcomisd_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2]
        movdqu  xmm1, [A3]
        vcomisd xmm0, xmm1
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
ENDPROC iemAImpl_vcomisd_u128
6128
6129
;;
; Pair of 128-bit source operands for two-input media instruction helpers.
; Need to move this as well somewhere better?
;
struc IEMMEDIAF2XMMSRC
    .uSrc1 resd 4                       ; first 128-bit (XMM sized) source operand
    .uSrc2 resd 4                       ; second 128-bit (XMM sized) source operand
endstruc
6137
6138
;
; CMPPS (SSE)
;
; The immediate byte cannot be supplied at runtime, so this dispatches into a
; generated table of 256 'cmpps xmm0, xmm1, imm8' + 'ret' stubs, 5 bytes each.
; The two 'dw' lines after the table turn a wrong entry size into an
; assemble-time overflow warning.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
BEGINPROC_FASTCALL iemAImpl_cmpps_u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T0, [A3 + A3*4]         ; T0 = A3 * 5; sizeof(cmpps+ret) == 5
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        lea     T1, [T1 + T0]           ; T1 = address of the stub for immediate A3
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        cmpps   xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*5 == 0x500
dw 0xfaff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x104ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_cmpps_u128
6174
;;
; SSE instructions with 8-bit immediates of the form
;       xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 5 bytes and we need to load and save the MXCSR
; register.
;
; Like iemAImpl_cmpps_u128 above this dispatches into a generated table of
; 256 instruction+ret stubs, here 6 bytes each (5-byte insn + 1-byte ret).
;
; @param 1 The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        lea     T0, [A3 + A3*2]         ; sizeof(insn+ret) == 6: T0 = A3 * 3, scaled by 2 below
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A3
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmppd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_5 cmpsd
6221
;;
; SSE instructions with 8-bit immediates of the form
;       xxx xmm1, xmm2, imm8.
; where the instruction encoding takes up 6 bytes and we need to load and save the MXCSR
; register.
;
; Same stub-table dispatch as the _5 variant above, but the stubs are 7 bytes
; each (6-byte insn + 1-byte ret), so the offset A3*7 is built from three LEAs.
;
; @param 1 The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register size operand (output).
; @param    A2      Pointer to the two media register sized inputs - IEMMEDIAF2XMMSRC (input).
; @param    A3      The 8-bit immediate (input).
;
%macro IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_4_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0

        movdqu  xmm0, [A2 + IEMMEDIAF2XMMSRC.uSrc1]
        movdqu  xmm1, [A2 + IEMMEDIAF2XMMSRC.uSrc2]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        lea     T0, [A3*2 + A3]         ; sizeof(insn+ret) == 7: 2 * (A3 * 3) + A3
        lea     T0, [T0*2]
        lea     T0, [T0 + A3]           ; T0 = A3 * 7
        lea     T1, [T1 + T0]           ; T1 = address of the stub for immediate A3
        call    T1
        movdqu  [A1], xmm0

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_4_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        %1      xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*(6+1) == 0x700
dw 0xf8ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x106ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundps
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundpd
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundss
IEMIMPL_MEDIA_SSE_INSN_IMM8_MXCSR_6 roundsd
6271
6272
;;
; SSE instructions of the form
;       xxx mm, xmm.
; and we need to load and save the MXCSR register.
;
; Converts a 128-bit XMM input to a 64-bit MMX result (cvtpd2pi/cvttpd2pi).
;
; @param 1 The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX register sized operand (output).
; @param    A2      Pointer to the media register sized operand (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A2]              ; load the 128-bit source
        %1      mm0, xmm0               ; convert into the MMX register
        movq    [A1], mm0               ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvtpd2pi
IEMIMPL_MEDIA_SSE_MXCSR_I64_U128 cvttpd2pi
6302
;;
; SSE instructions of the form
;       xxx xmm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Converts a 64-bit MMX-style input into a 128-bit XMM destination which is
; first loaded from [A1] so the untouched lanes are preserved (cvtpi2ps/cvtpi2pd).
;
; @param 1 The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first media register sized operand (input/output).
; @param    A2      The 64bit source value from a MMX media register (input)
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movdqu  xmm0, [A1]              ; load the current destination value
        movq    mm0, A2                 ; move the 64-bit source into an MMX register
        %1      xmm0, mm0               ; perform the conversion
        movdqu  [A1], xmm0              ; write back the full 128-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2ps
IEMIMPL_MEDIA_SSE_MXCSR_U128_U64 cvtpi2pd
6333
;;
; SSE instructions of the form
;       xxx mm, xmm/m64.
; and we need to load and save the MXCSR register.
;
; Converts a 64-bit source value into a 64-bit MMX result (cvtps2pi/cvttps2pi).
;
; @param 1 The instruction name.
;
; @param    A0      Pointer to the MXCSR value (input/output).
; @param    A1      Pointer to the first MMX media register sized operand (output).
; @param    A2      The 64bit source value (input).
;
%macro IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 1
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE
        SSE_LD_FXSTATE_MXCSR_ONLY A0    ; run with guest rounding mode, all exceptions masked

        movq    xmm0, A2                ; place the 64-bit source in the low half of xmm0
        %1      mm0, xmm0               ; convert into the MMX register
        movq    [A1], mm0               ; store the 64-bit result

        SSE_ST_FXSTATE_MXCSR_ONLY_NO_FXSTATE A0 ; merge exception flags back, restore host MXCSR
        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u128
%endmacro

IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvtps2pi
IEMIMPL_MEDIA_SSE_MXCSR_U64_U64 cvttps2pi
6363
;
; All forms of RDRAND and RDSEED
;
; The instruction's CF result (1 = valid random value, 0 = try again, per the
; Intel SDM) is captured together with the other status flags via IEM_SAVE_FLAGS.
;
; @param 1 The instruction mnemonic (rdrand/rdseed).
; @param 2 The register to receive the random value (ax/eax/rax).
; @param 3 The operand width in bits, used for the function name suffix.
;
; @param    A0      Pointer to the destination operand.
; @param    A1      Pointer to the EFLAGS value (input/output).
;
%macro IEMIMPL_RDRAND_RDSEED 3
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u %+ %3, 8
        PROLOGUE_2_ARGS

        %1      %2                      ; get a random value (sets CF on success)
        mov     [A0], %2                ; store it to the destination
        IEM_SAVE_FLAGS A1, X86_EFL_STATUS_BITS, 0 ; propagate the resulting flags to the guest EFLAGS

        EPILOGUE_2_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u %+ %3
%endmacro

IEMIMPL_RDRAND_RDSEED rdrand, ax,  16
IEMIMPL_RDRAND_RDSEED rdrand, eax, 32
IEMIMPL_RDRAND_RDSEED rdrand, rax, 64
IEMIMPL_RDRAND_RDSEED rdseed, ax,  16
IEMIMPL_RDRAND_RDSEED rdseed, eax, 32
IEMIMPL_RDRAND_RDSEED rdseed, rax, 64
6388
6389
;;
; sha1rnds4 xmm1, xmm2, imm8.
;
; Immediate dispatch via a generated table of 256 'sha1rnds4+ret' stubs,
; 6 bytes each (5-byte insn + 1-byte ret); no MXCSR handling is needed for
; this integer instruction.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      The 8-bit immediate
;
BEGINPROC_FASTCALL iemAImpl_sha1rnds4_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A0]
        movdqu  xmm1, [A1]
        lea     T1, [.imm0 xWrtRIP]     ; T1 = start of the stub table
        lea     T0, [A2 + A2*2]         ; sizeof(insn+ret) == 6: T0 = A2 * 3, scaled by 2 below
        lea     T1, [T1 + T0*2]         ; T1 = address of the stub for immediate A2
        call    T1
        movdqu  [A0], xmm0              ; write the result back to the destination

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
 %assign bImm 0
 %rep 256
.imm %+ bImm:
        sha1rnds4 xmm0, xmm1, bImm
        ret
 %assign bImm bImm + 1
 %endrep
.immEnd:                                ; 256*6 == 0x600
dw 0xf9ff + (.immEnd - .imm0)           ; will cause warning if entries are too big.
dw 0x105ff - (.immEnd - .imm0)          ; will cause warning if entries are too small.
ENDPROC iemAImpl_sha1rnds4_u128
6424
6425
;;
; sha256rnds2 xmm1, xmm2, <XMM0>.
;
; The instruction takes XMM0 as an implicit third operand, so the constants
; from [A2] are loaded into xmm0 before executing it.
;
; @param    A0      Pointer to the first media register size operand (input/output).
; @param    A1      Pointer to the second source media register size operand (input).
; @param    A2      Pointer to the implicit XMM0 constants (input).
;
BEGINPROC_FASTCALL iemAImpl_sha256rnds2_u128, 16
        PROLOGUE_3_ARGS
        IEMIMPL_SSE_PROLOGUE

        movdqu  xmm0, [A2]              ; implicit operand: must live in xmm0
        movdqu  xmm1, [A0]              ; destination operand
        movdqu  xmm2, [A1]              ; source operand
        sha256rnds2 xmm1, xmm2          ; implicitly also reads xmm0
        movdqu  [A0], xmm1              ; write the result back to the destination

        IEMIMPL_SSE_EPILOGUE
        EPILOGUE_3_ARGS
ENDPROC iemAImpl_sha256rnds2_u128
6448
6449
;
; 32-bit forms of ADCX and ADOX
;
; Computes [A0] = [A0] + A2 + flag, where the flag consumed and produced is
; the one given in %2 (CF for adcx, OF for adox); no other flags are touched
; by these instructions.
;
; @param 1 The instruction mnemonic (adcx/adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF/X86_EFL_OF).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      32-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_32 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u32, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): only three arguments are used here - presumably harmless, confirm

        IEM_LOAD_FLAGS A1, %2, 0        ; materialize the guest CF/OF before the instruction
        %1      A2_32, [A0]             ; A2 = A2 + [A0] + flag
        mov     [A0], A2_32             ; write the sum back to the destination
        IEM_SAVE_FLAGS A1, %2, 0        ; store the resulting CF/OF into the guest EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u32
%endmacro

;
; 64-bit forms of ADCX and ADOX
;
; Same as IEMIMPL_ADX_32 above, but operating on 64-bit values.
;
; @param 1 The instruction mnemonic (adcx/adox).
; @param 2 The EFLAGS bit the instruction consumes and produces (X86_EFL_CF/X86_EFL_OF).
;
; @param    A0      Pointer to the destination operand (input/output).
; @param    A1      Pointer to the EFLAGS value (input/output).
; @param    A2      64-bit source operand 1 (input).
;
%macro IEMIMPL_ADX_64 2
BEGINPROC_FASTCALL iemAImpl_ %+ %1 %+ _u64, 8
        PROLOGUE_4_ARGS                 ; NOTE(review): only three arguments are used here - presumably harmless, confirm

        IEM_LOAD_FLAGS A1, %2, 0        ; materialize the guest CF/OF before the instruction
        %1      A2, [A0]                ; A2 = A2 + [A0] + flag
        mov     [A0], A2                ; write the sum back to the destination
        IEM_SAVE_FLAGS A1, %2, 0        ; store the resulting CF/OF into the guest EFLAGS

        EPILOGUE_4_ARGS
ENDPROC iemAImpl_ %+ %1 %+ _u64
%endmacro

IEMIMPL_ADX_32 adcx, X86_EFL_CF
IEMIMPL_ADX_64 adcx, X86_EFL_CF

IEMIMPL_ADX_32 adox, X86_EFL_OF
IEMIMPL_ADX_64 adox, X86_EFL_OF
注意: 瀏覽 TracBrowser 來幫助您使用儲存庫瀏覽器

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette