VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veRecompiler.cpp@ 101387

Last change on this file since 101387 was 101387, checked in by vboxsync, 18 months ago

VMM/IEM: Added a new class of threaded function variants, the 16f/32f/64f variants that will clear RF (and vbox internal friends) and check for TF (and vbox internal friends). The variants w/o the 'f' after the bitcount will skip this test+branch. The motivation of this was to deal with this issue that the threaded recompiler level rather than try optimize away the test+branch++ code when generating native code, make the IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC32 a very simple place to start emitting native code (compared to IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC32_WITH_FLAGS). bugref:10371
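Roughly, the two variant classes differ as sketched below. This is an illustrative approximation only, not the actual macro bodies; the flag-mask names fTfAndFriends and fRfAndFriends and the slow-path comment stand in for VBox-internal details:

    /* Non-'f' variant, e.g. IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC32:
       just advance the 32-bit PC and finish. */
    pVCpu->cpum.GstCtx.rip = (uint32_t)(pVCpu->cpum.GstCtx.rip + cbInstr);

    /* 'f' variant, e.g. IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC32_WITH_FLAGS:
       additionally clear RF (and VBox-internal friends) and test TF (and
       VBox-internal friends), branching to a slow path when any are set. */
    pVCpu->cpum.GstCtx.rip = (uint32_t)(pVCpu->cpum.GstCtx.rip + cbInstr);
    if (RT_LIKELY(!(pVCpu->cpum.GstCtx.eflags.u & fTfAndFriends)))
        pVCpu->cpum.GstCtx.eflags.u &= ~fRfAndFriends;
    else
    {
        /* ... deliver the pending #DB / debug event on the slow path ... */
    }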

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 83.8 KB
 
1/* $Id: IEMAllN8veRecompiler.cpp 101387 2023-10-07 23:34:54Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler
4 *
5 * Logging group IEM_RE_NATIVE assignments:
6 * - Level 1 (Log) : ...
7 * - Flow (LogFlow) : ...
8 * - Level 2 (Log2) : ...
9 * - Level 3 (Log3) : ...
10 * - Level 4 (Log4) : ...
11 * - Level 5 (Log5) : ...
12 * - Level 6 (Log6) : ...
13 * - Level 7 (Log7) : ...
14 * - Level 8 (Log8) : ...
15 * - Level 9 (Log9) : ...
16 * - Level 10 (Log10): ...
17 * - Level 11 (Log11): ...
18 * - Level 12 (Log12): ...
19 */
20
21/*
22 * Copyright (C) 2023 Oracle and/or its affiliates.
23 *
24 * This file is part of VirtualBox base platform packages, as
25 * available from https://www.virtualbox.org.
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation, in version 3 of the
30 * License.
31 *
32 * This program is distributed in the hope that it will be useful, but
33 * WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
35 * General Public License for more details.
36 *
37 * You should have received a copy of the GNU General Public License
38 * along with this program; if not, see <https://www.gnu.org/licenses>.
39 *
40 * SPDX-License-Identifier: GPL-3.0-only
41 */
42
43
44/*********************************************************************************************************************************
45* Header Files *
46*********************************************************************************************************************************/
47#define LOG_GROUP LOG_GROUP_IEM_RE_THREADED
48#define IEM_WITH_OPAQUE_DECODER_STATE
49#define VMCPU_INCL_CPUM_GST_CTX
50#define VMM_INCLUDED_SRC_include_IEMMc_h /* block IEMMc.h inclusion. */
51#include <VBox/vmm/iem.h>
52#include <VBox/vmm/cpum.h>
53#include "IEMInternal.h"
54#include <VBox/vmm/vmcc.h>
55#include <VBox/log.h>
56#include <VBox/err.h>
57#include <VBox/param.h>
58#include <iprt/assert.h>
59#include <iprt/heap.h>
60#include <iprt/mem.h>
61#include <iprt/string.h>
62#if defined(RT_ARCH_AMD64)
63# include <iprt/x86.h>
64#elif defined(RT_ARCH_ARM64)
65# include <iprt/armv8.h>
66#endif
67
68#ifdef RT_OS_WINDOWS
69# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
70extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
71extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
72#else
73# include <iprt/formats/dwarf.h>
74# if defined(RT_OS_DARWIN)
75# include <libkern/OSCacheControl.h>
76# define IEMNATIVE_USE_LIBUNWIND
77extern "C" void __register_frame(const void *pvFde);
78extern "C" void __deregister_frame(const void *pvFde);
79# else
80extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
81extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
82# endif
83#endif
84
85#include "IEMInline.h"
86#include "IEMThreadedFunctions.h"
87#include "IEMN8veRecompiler.h"
88#include "IEMNativeFunctions.h"
89
90
91/*
92 * Narrow down configs here to avoid wasting time on unused configs.
93 * Note! Same checks in IEMAllThrdRecompiler.cpp.
94 */
95
96#ifndef IEM_WITH_CODE_TLB
97# error The code TLB must be enabled for the recompiler.
98#endif
99
100#ifndef IEM_WITH_DATA_TLB
101# error The data TLB must be enabled for the recompiler.
102#endif
103
104#ifndef IEM_WITH_SETJMP
105# error The setjmp approach must be enabled for the recompiler.
106#endif
107
108
109/*********************************************************************************************************************************
110* Executable Memory Allocator *
111*********************************************************************************************************************************/
112/** @def IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
113 * Use an alternative chunk sub-allocator that does not store internal data
114 * in the chunk.
115 *
116 * Using RTHeapSimple is not practical on newer darwin systems where
117 * RTMEM_PROT_WRITE and RTMEM_PROT_EXEC are mutually exclusive in process
118 * memory. We would have to change the protection of the whole chunk for
119 * every call to RTHeapSimple, which would be rather expensive.
120 *
121 * This alternative implementation restricts page protection modifications
122 * to the pages backing the executable memory we just allocated.
123 */
124#define IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
125/** The chunk sub-allocation unit size in bytes. */
126#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 128
127/** The chunk sub-allocation unit size as a shift factor. */
128#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 7
129
130#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
131/**
132 * Per-chunk unwind info for non-windows hosts.
133 */
134typedef struct IEMEXECMEMCHUNKEHFRAME
135{
136# ifdef IEMNATIVE_USE_LIBUNWIND
137 /** The offset of the FDA into abEhFrame. */
138 uintptr_t offFda;
139# else
140 /** struct object storage area. */
141 uint8_t abObject[1024];
142# endif
143 /** The dwarf ehframe data for the chunk. */
144 uint8_t abEhFrame[512];
145} IEMEXECMEMCHUNKEHFRAME;
146/** Pointer to per-chunk unwind info for non-windows hosts. */
147typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
148#endif
149
150
151/**
152 * A chunk of executable memory.
153 */
154typedef struct IEMEXECMEMCHUNK
155{
156#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
157 /** Number of free items in this chunk. */
158 uint32_t cFreeUnits;
159 /** Hint where to start searching for free space in the allocation bitmap. */
160 uint32_t idxFreeHint;
161#else
162 /** The heap handle. */
163 RTHEAPSIMPLE hHeap;
164#endif
165 /** Pointer to the chunk. */
166 void *pvChunk;
167#ifdef IN_RING3
168 /**
169 * Pointer to the unwind information.
170 *
171 * This is used during C++ throw and longjmp (windows and probably most other
172 * platforms). Some debuggers (windbg) make use of it as well.
173 *
174 * Windows: This is allocated from hHeap on windows because (at least for
175 * AMD64) the UNWIND_INFO structure address in the
176 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
177 *
178 * Others: Allocated from the regular heap to avoid unnecessary executable data
179 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
180 void *pvUnwindInfo;
181#elif defined(IN_RING0)
182 /** Allocation handle. */
183 RTR0MEMOBJ hMemObj;
184#endif
185} IEMEXECMEMCHUNK;
186/** Pointer to a memory chunk. */
187typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
188
189
190/**
191 * Executable memory allocator for the native recompiler.
192 */
193typedef struct IEMEXECMEMALLOCATOR
194{
195 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
196 uint32_t uMagic;
197
198 /** The chunk size. */
199 uint32_t cbChunk;
200 /** The maximum number of chunks. */
201 uint32_t cMaxChunks;
202 /** The current number of chunks. */
203 uint32_t cChunks;
204 /** Hint where to start looking for available memory. */
205 uint32_t idxChunkHint;
206 /** Statistics: Current number of allocations. */
207 uint32_t cAllocations;
208
209 /** The total amount of memory available. */
210 uint64_t cbTotal;
211 /** Total amount of free memory. */
212 uint64_t cbFree;
213 /** Total amount of memory allocated. */
214 uint64_t cbAllocated;
215
216#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
217 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
218 *
219 * Since the chunk size is a power of two and the minimum chunk size is a lot
220 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
221 * require a whole number of uint64_t elements in the allocation bitmap. So,
222 * for the sake of simplicity/laziness, they are allocated as one
223 * continuous block. */
224 uint64_t *pbmAlloc;
225 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
226 uint32_t cUnitsPerChunk;
227 /** Number of bitmap elements per chunk (for quickly locating the bitmap
228 * portion corresponding to a chunk). */
229 uint32_t cBitmapElementsPerChunk;
230#else
231 /** @name Tweaks to get 64 byte aligned allocations w/o unnecessary fragmentation.
232 * @{ */
233 /** The size of the heap internal block header. This is used to adjust the
234 * request memory size to make sure there is exactly enough room for a header at
235 * the end of the blocks we allocate before the next 64 byte alignment line. */
236 uint32_t cbHeapBlockHdr;
237 /** The size of the initial heap allocation required to make sure the first
238 * allocation is correctly aligned. */
239 uint32_t cbHeapAlignTweak;
240 /** The alignment tweak allocation address. */
241 void *pvAlignTweak;
242 /** @} */
243#endif
244
245#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
246 /** Pointer to the array of unwind info running parallel to aChunks (same
247 * allocation as this structure, located after the bitmaps).
248 * (For Windows, the structures must reside in 32-bit RVA distance to the
249 * actual chunk, so they are allocated off the chunk.) */
250 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
251#endif
252
253 /** The allocation chunks. */
254 RT_FLEXIBLE_ARRAY_EXTENSION
255 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
256} IEMEXECMEMALLOCATOR;
257/** Pointer to an executable memory allocator. */
258typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
259
260/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
261#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
262
263
264static int iemExecMemAllocatorGrow(PIEMEXECMEMALLOCATOR pExecMemAllocator);
265
266
267/**
268 * Worker for iemExecMemAllocatorAlloc that returns @a pvRet after updating
269 * the heap statistics.
270 */
271static void * iemExecMemAllocatorAllocTailCode(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvRet,
272 uint32_t cbReq, uint32_t idxChunk)
273{
274 pExecMemAllocator->cAllocations += 1;
275 pExecMemAllocator->cbAllocated += cbReq;
276#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
277 pExecMemAllocator->cbFree -= cbReq;
278#else
279 pExecMemAllocator->cbFree -= RT_ALIGN_32(cbReq, 64);
280#endif
281 pExecMemAllocator->idxChunkHint = idxChunk;
282
283#ifdef RT_OS_DARWIN
284 /*
285 * Sucks, but RTMEM_PROT_EXEC and RTMEM_PROT_WRITE are mutually exclusive
286 * on darwin. So, we mark the pages returned as read+write after alloc and
287 * expect the caller to call iemExecMemAllocatorReadyForUse when done
288 * writing to the allocation.
289 *
290 * See also https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
291 * for details.
292 */
293 /** @todo detect if this is necessary... it wasn't required on 10.15 or
294 * whatever older version it was. */
295 int rc = RTMemProtect(pvRet, cbReq, RTMEM_PROT_WRITE | RTMEM_PROT_READ);
296 AssertRC(rc);
297#endif
298
299 return pvRet;
300}
301
302
303#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
304static void *iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
305 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk)
306{
307 /*
308 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
309 */
310 Assert(!(cToScan & 63));
311 Assert(!(idxFirst & 63));
312 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
313 pbmAlloc += idxFirst / 64;
314
315 /*
316 * Scan the bitmap for cReqUnits consecutive clear bits
317 */
318 /** @todo This can probably be done more efficiently for non-x86 systems. */
319 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
320 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
321 {
322 uint32_t idxAddBit = 1;
323 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
324 idxAddBit++;
325 if (idxAddBit >= cReqUnits)
326 {
327 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
328
329 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
330 pChunk->cFreeUnits -= cReqUnits;
331 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
332
333 void * const pvRet = (uint8_t *)pChunk->pvChunk
334 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
335
336 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet,
337 cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT, idxChunk);
338 }
339
340 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
341 }
342 return NULL;
343}
344#endif /* IEMEXECMEM_USE_ALT_SUB_ALLOCATOR */
345
346
347static void *iemExecMemAllocatorAllocInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq)
348{
349#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
350 /*
351 * Figure out how much to allocate.
352 */
353 uint32_t const cReqUnits = (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1) >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
354 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
355 {
356 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
357 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
358 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
359 {
360 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
361 pExecMemAllocator->cUnitsPerChunk - idxHint, cReqUnits, idxChunk);
362 if (pvRet)
363 return pvRet;
364 }
365 return iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
366 RT_MIN(pExecMemAllocator->cUnitsPerChunk, RT_ALIGN_32(idxHint + cReqUnits, 64)),
367 cReqUnits, idxChunk);
368 }
369#else
370 void *pvRet = RTHeapSimpleAlloc(pExecMemAllocator->aChunks[idxChunk].hHeap, cbReq, 32);
371 if (pvRet)
372 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet, cbReq, idxChunk);
373#endif
374 return NULL;
375
376}
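/* Worked example (illustrative): with IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE = 128
   (shift 7), a request of cbReq = 1000 bytes becomes
   cReqUnits = (1000 + 127) >> 7 = 8 units, i.e. 1024 bytes, and the bitmap scan
   above looks for 8 consecutive clear bits starting at the chunk's idxFreeHint. */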
377
378
379/**
380 * Allocates @a cbReq bytes of executable memory.
381 *
382 * @returns Pointer to the memory, NULL if out of memory or other problem
383 * encountered.
384 * @param pVCpu The cross context virtual CPU structure of the calling
385 * thread.
386 * @param cbReq How many bytes are required.
387 */
388static void *iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq)
389{
390 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
391 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
392 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
393
394 /*
395 * Adjust the request size so it'll fit the allocator alignment/whatnot.
396 *
397 * For the RTHeapSimple allocator this means to follow the logic described
398 * in iemExecMemAllocatorGrow and attempt to allocate it from one of the
399 * existing chunks if we think we've got sufficient free memory around.
400 *
401 * While for the alternative one we just align it up to a whole unit size.
402 */
403#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
404 cbReq = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
405#else
406 cbReq = RT_ALIGN_32(cbReq + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
407#endif
408 if (cbReq <= pExecMemAllocator->cbFree)
409 {
410 uint32_t const cChunks = pExecMemAllocator->cChunks;
411 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
412 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
413 {
414 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
415 if (pvRet)
416 return pvRet;
417 }
418 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
419 {
420 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
421 if (pvRet)
422 return pvRet;
423 }
424 }
425
426 /*
427 * Can we grow it with another chunk?
428 */
429 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
430 {
431 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
432 AssertLogRelRCReturn(rc, NULL);
433
434 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
435 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
436 if (pvRet)
437 return pvRet;
438 AssertFailed();
439 }
440
441 /* What now? Prune native translation blocks from the cache? */
442 AssertFailed();
443 return NULL;
444}
445
446
447/** This is a hook that we may need later for changing memory protection back
448 * to readonly+exec */
449static void iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb)
450{
451#ifdef RT_OS_DARWIN
452 /* See iemExecMemAllocatorAllocTailCode for the explanation. */
453 int rc = RTMemProtect(pv, cb, RTMEM_PROT_EXEC | RTMEM_PROT_READ);
454 AssertRC(rc); RT_NOREF(pVCpu);
455
456 /*
457 * Flush the instruction cache:
458 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
459 */
460 /* sys_dcache_flush(pv, cb); - not necessary */
461 sys_icache_invalidate(pv, cb);
462#else
463 RT_NOREF(pVCpu, pv, cb);
464#endif
465}
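/* Typical usage sketch (illustrative only, not lifted from the real callers):
 * allocate, write the native code while the pages are still writable, flip
 * them to read+exec before running, and free once the TB is retired.
 *
 *     void *pvCode = iemExecMemAllocatorAlloc(pVCpu, cbNativeCode);
 *     if (pvCode)
 *     {
 *         memcpy(pvCode, pabNativeCode, cbNativeCode);                  // emit/copy the instructions
 *         iemExecMemAllocatorReadyForUse(pVCpu, pvCode, cbNativeCode);  // RX + icache flush on darwin
 *         // ... execute the translation block ...
 *         iemExecMemAllocatorFree(pVCpu, pvCode, cbNativeCode);         // when no longer needed
 *     }
 */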
466
467
468/**
469 * Frees executable memory.
470 */
471void iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb)
472{
473 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
474 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
475 Assert(pv);
476#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
477 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
478#else
479 Assert(!((uintptr_t)pv & 63));
480#endif
481
482 /* Align the size as we did when allocating the block. */
483#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
484 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
485#else
486 cb = RT_ALIGN_Z(cb + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
487#endif
488
489 /* Free it / assert sanity. */
490#if defined(VBOX_STRICT) || defined(IEMEXECMEM_USE_ALT_SUB_ALLOCATOR)
491 uint32_t const cChunks = pExecMemAllocator->cChunks;
492 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
493 bool fFound = false;
494 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
495 {
496 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunk;
497 fFound = offChunk < cbChunk;
498 if (fFound)
499 {
500#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
501 uint32_t const idxFirst = offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
502 uint32_t const cReqUnits = cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
503
504 /* Check that it's valid and free it. */
505 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
506 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
507 for (uint32_t i = 1; i < cReqUnits; i++)
508 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
509 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
510
511 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
512 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
513
514 /* Update the stats. */
515 pExecMemAllocator->cbAllocated -= cb;
516 pExecMemAllocator->cbFree += cb;
517 pExecMemAllocator->cAllocations -= 1;
518 return;
519#else
520 Assert(RTHeapSimpleSize(pExecMemAllocator->aChunks[idxChunk].hHeap, pv) == cb);
521 break;
522#endif
523 }
524 }
525# ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
526 AssertFailed();
527# else
528 Assert(fFound);
529# endif
530#endif
531
532#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
533 /* Update stats while cb is freshly calculated. */
534 pExecMemAllocator->cbAllocated -= cb;
535 pExecMemAllocator->cbFree += RT_ALIGN_Z(cb, 64);
536 pExecMemAllocator->cAllocations -= 1;
537
538 /* Free it. */
539 RTHeapSimpleFree(NIL_RTHEAPSIMPLE, pv);
540#endif
541}
542
543
544
545#ifdef IN_RING3
546# ifdef RT_OS_WINDOWS
547
548/**
549 * Initializes the unwind info structures for windows hosts.
550 */
551static int
552iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvChunk, uint32_t idxChunk)
553{
554 /*
555 * The AMD64 unwind opcodes.
556 *
557 * This is a program that starts with RSP after a RET instruction that
558 * ends up in recompiled code, and the operations we describe here will
559 * restore all non-volatile registers and bring RSP back to where our
560 * RET address is. This means it's reverse order from what happens in
561 * the prologue.
562 *
563 * Note! Using a frame register approach here, both because we have one
564 * and mainly because the UWOP_ALLOC_LARGE argument values
565 * would be a pain to write initializers for. On the positive
566 * side, we're impervious to changes in the stack variable
567 * area and can deal with dynamic stack allocations if necessary.
568 */
569 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
570 {
571 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
572 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
573 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
574 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
575 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
576 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
577 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
578 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
579 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
580 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
581 };
582 union
583 {
584 IMAGE_UNWIND_INFO Info;
585 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
586 } s_UnwindInfo =
587 {
588 {
589 /* .Version = */ 1,
590 /* .Flags = */ 0,
591 /* .SizeOfProlog = */ 16, /* whatever */
592 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
593 /* .FrameRegister = */ X86_GREG_xBP,
594 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
595 }
596 };
597 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
598 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
599
600 /*
601 * Calc how much space we need and allocate it off the exec heap.
602 */
603 unsigned const cFunctionEntries = 1;
604 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
605 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
606# ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
607 unsigned const cbNeededAligned = RT_ALIGN_32(cbNeeded, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
608 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
609 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbNeededAligned);
610# else
611 unsigned const cbNeededAligned = RT_ALIGN_32(cbNeeded + pExecMemAllocator->cbHeapBlockHdr, 64)
612 - pExecMemAllocator->cbHeapBlockHdr;
613 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions = (PIMAGE_RUNTIME_FUNCTION_ENTRY)RTHeapSimpleAlloc(hHeap, cbNeededAligned,
614 32 /*cbAlignment*/);
615# endif
616 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
617 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
618
619 /*
620 * Initialize the structures.
621 */
622 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
623
624 paFunctions[0].BeginAddress = 0;
625 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
626 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
627
628 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
629 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
630
631 /*
632 * Register it.
633 */
634 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
635 AssertReturn(fRet, NULL); /* Nothing to clean up on failure, since it's within the chunk itself. */
636
637 return paFunctions;
638}
639
640
641# else /* !RT_OS_WINDOWS */
642
643/**
644 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
645 */
646DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
647{
648 if (iValue >= 64)
649 {
650 Assert(iValue < 0x2000);
651 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
652 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
653 }
654 else if (iValue >= 0)
655 *Ptr.pb++ = (uint8_t)iValue;
656 else if (iValue > -64)
657 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
658 else
659 {
660 Assert(iValue > -0x2000);
661 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
662 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
663 }
664 return Ptr;
665}
666
667
668/**
669 * Emits an ULEB128 encoded value (up to 64-bit wide).
670 */
671DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
672{
673 while (uValue >= 0x80)
674 {
675 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
676 uValue >>= 7;
677 }
678 *Ptr.pb++ = (uint8_t)uValue;
679 return Ptr;
680}
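/* Worked examples (illustrative): iemDwarfPutUleb128(Ptr, 300) emits the bytes
   0xac 0x02 (300 = 0x12c: low 7 bits 0x2c with the continuation bit set, then
   0x02), and iemDwarfPutLeb128(Ptr, -72) emits 0xb8 0x7f, which matches the
   standard SLEB128 encoding within the restricted +/-0x2000 range. */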
681
682
683/**
684 * Emits a CFA rule as register @a uReg + offset @a off.
685 */
686DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
687{
688 *Ptr.pb++ = DW_CFA_def_cfa;
689 Ptr = iemDwarfPutUleb128(Ptr, uReg);
690 Ptr = iemDwarfPutUleb128(Ptr, off);
691 return Ptr;
692}
693
694
695/**
696 * Emits a register (@a uReg) save location:
697 * CFA + @a off * data_alignment_factor
698 */
699DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
700{
701 if (uReg < 0x40)
702 *Ptr.pb++ = DW_CFA_offset | uReg;
703 else
704 {
705 *Ptr.pb++ = DW_CFA_offset_extended;
706 Ptr = iemDwarfPutUleb128(Ptr, uReg);
707 }
708 Ptr = iemDwarfPutUleb128(Ptr, off);
709 return Ptr;
710}
711
712
713# if 0 /* unused */
714/**
715 * Emits a register (@a uReg) save location, using signed offset:
716 * CFA + @a offSigned * data_alignment_factor
717 */
718DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
719{
720 *Ptr.pb++ = DW_CFA_offset_extended_sf;
721 Ptr = iemDwarfPutUleb128(Ptr, uReg);
722 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
723 return Ptr;
724}
725# endif
726
727
728/**
729 * Initializes the unwind info section for non-windows hosts.
730 */
731static int
732iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvChunk, uint32_t idxChunk)
733{
734 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
735 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
736
737 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
738
739 /*
740 * Generate the CIE first.
741 */
742# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
743 uint8_t const iDwarfVer = 3;
744# else
745 uint8_t const iDwarfVer = 4;
746# endif
747 RTPTRUNION const PtrCie = Ptr;
748 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
749 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
750 *Ptr.pb++ = iDwarfVer; /* DWARF version */
751 *Ptr.pb++ = 0; /* Augmentation. */
752 if (iDwarfVer >= 4)
753 {
754 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
755 *Ptr.pb++ = 0; /* Segment selector size. */
756 }
757# ifdef RT_ARCH_AMD64
758 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
759# else
760 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
761# endif
762 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
763# ifdef RT_ARCH_AMD64
764 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
765# elif defined(RT_ARCH_ARM64)
766 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
767# else
768# error "port me"
769# endif
770 /* Initial instructions: */
771# ifdef RT_ARCH_AMD64
772 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
773 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
774 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
775 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
776 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
777 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
778 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
779 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
780# elif defined(RT_ARCH_ARM64)
781# if 1
782 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
783# else
784 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
785# endif
786 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
787 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
788 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
789 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
790 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
791 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
792 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
793 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
794 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
795 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
796 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
797 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
798 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
799 /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
800# else
801# error "port me"
802# endif
803 while ((Ptr.u - PtrCie.u) & 3)
804 *Ptr.pb++ = DW_CFA_nop;
805 /* Finalize the CIE size. */
806 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
807
808 /*
809 * Generate an FDE for the whole chunk area.
810 */
811# ifdef IEMNATIVE_USE_LIBUNWIND
812 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
813# endif
814 RTPTRUNION const PtrFde = Ptr;
815 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
816 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
817 Ptr.pu32++;
818 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
819 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
820# if 0 /* not required for recent libunwind.dylib nor recent libgcc/glibc. */
821 *Ptr.pb++ = DW_CFA_nop;
822# endif
823 while ((Ptr.u - PtrFde.u) & 3)
824 *Ptr.pb++ = DW_CFA_nop;
825 /* Finalize the FDE size. */
826 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
827
828 /* Terminator entry. */
829 *Ptr.pu32++ = 0;
830 *Ptr.pu32++ = 0; /* just to be sure... */
831 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
832
833 /*
834 * Register it.
835 */
836# ifdef IEMNATIVE_USE_LIBUNWIND
837 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
838# else
839 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
840 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
841# endif
842
843 return VINF_SUCCESS;
844}
845
846# endif /* !RT_OS_WINDOWS */
847#endif /* IN_RING3 */
848
849
850/**
851 * Adds another chunk to the executable memory allocator.
852 *
853 * This is used by the init code for the initial allocation and later by the
854 * regular allocator function when it's out of memory.
855 */
856static int iemExecMemAllocatorGrow(PIEMEXECMEMALLOCATOR pExecMemAllocator)
857{
858 /* Check that we've room for growth. */
859 uint32_t const idxChunk = pExecMemAllocator->cChunks;
860 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
861
862 /* Allocate a chunk. */
863#ifdef RT_OS_DARWIN
864 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
865#else
866 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
867#endif
868 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
869
870#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
871 int rc = VINF_SUCCESS;
872#else
873 /* Initialize the heap for the chunk. */
874 RTHEAPSIMPLE hHeap = NIL_RTHEAPSIMPLE;
875 int rc = RTHeapSimpleInit(&hHeap, pvChunk, pExecMemAllocator->cbChunk);
876 AssertRC(rc);
877 if (RT_SUCCESS(rc))
878 {
879 /*
880 * We want the memory to be aligned on 64 byte, so the first time thru
881 * here we do some exploratory allocations to see how we can achieve this.
882 * On subsequent runs we only make an initial adjustment allocation, if
883 * necessary.
884 *
885 * Since we own the heap implementation, we know that the internal block
886 * header is 32 bytes in size for 64-bit systems (see RTHEAPSIMPLEBLOCK),
887 * so all we need to do wrt allocation size adjustments is to add 32 bytes
888 * to the size, align up by 64 bytes, and subtract 32 bytes.
889 *
890 * The heap anchor block is 8 * sizeof(void *) (see RTHEAPSIMPLEINTERNAL),
891 * which means 64 bytes on a 64-bit system, so we need to make a 64 byte
892 * allocation to force subsequent allocations to return 64 byte aligned
893 * user areas.
894 */
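         /* Illustrative example of the adjustment described above: a 100 byte
            request becomes RT_ALIGN_32(100 + 32, 64) - 32 = 192 - 32 = 160 bytes,
            so the 32 byte block header plus the user area end exactly on the
            next 64 byte boundary. */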
895 if (!pExecMemAllocator->cbHeapBlockHdr)
896 {
897 pExecMemAllocator->cbHeapBlockHdr = sizeof(void *) * 4; /* See RTHEAPSIMPLEBLOCK. */
898 pExecMemAllocator->cbHeapAlignTweak = 64;
899 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak,
900 32 /*cbAlignment*/);
901 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_2);
902
903 void *pvTest1 = RTHeapSimpleAlloc(hHeap,
904 RT_ALIGN_32(256 + pExecMemAllocator->cbHeapBlockHdr, 64)
905 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
906 AssertStmt(pvTest1, rc = VERR_INTERNAL_ERROR_2);
907 AssertStmt(!((uintptr_t)pvTest1 & 63), rc = VERR_INTERNAL_ERROR_3);
908
909 void *pvTest2 = RTHeapSimpleAlloc(hHeap,
910 RT_ALIGN_32(687 + pExecMemAllocator->cbHeapBlockHdr, 64)
911 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
912 AssertStmt(pvTest2, rc = VERR_INTERNAL_ERROR_2);
913 AssertStmt(!((uintptr_t)pvTest2 & 63), rc = VERR_INTERNAL_ERROR_3);
914
915 RTHeapSimpleFree(hHeap, pvTest2);
916 RTHeapSimpleFree(hHeap, pvTest1);
917 }
918 else
919 {
920 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak, 32 /*cbAlignment*/);
921 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_4);
922 }
923 if (RT_SUCCESS(rc))
924#endif /* !IEMEXECMEM_USE_ALT_SUB_ALLOCATOR */
925 {
926 /*
927 * Add the chunk.
928 *
929 * This must be done before the unwind init so windows can allocate
930 * memory from the chunk when using the alternative sub-allocator.
931 */
932 pExecMemAllocator->aChunks[idxChunk].pvChunk = pvChunk;
933#ifdef IN_RING3
934 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
935#endif
936#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
937 pExecMemAllocator->aChunks[idxChunk].hHeap = hHeap;
938#else
939 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
940 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
941 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
942 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
943#endif
944
945 pExecMemAllocator->cChunks = idxChunk + 1;
946 pExecMemAllocator->idxChunkHint = idxChunk;
947
948#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
949 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
950 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
951#else
952 size_t const cbFree = RTHeapSimpleGetFreeSize(hHeap);
953 pExecMemAllocator->cbTotal += cbFree;
954 pExecMemAllocator->cbFree += cbFree;
955#endif
956
957#ifdef IN_RING3
958 /*
959 * Initialize the unwind information (this cannot really fail atm).
960 * (This sets pvUnwindInfo.)
961 */
962 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pExecMemAllocator, pvChunk, idxChunk);
963 if (RT_SUCCESS(rc))
964#endif
965 {
966 return VINF_SUCCESS;
967 }
968
969#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
970 /* Just in case the impossible happens, undo the above: */
971 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
972 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
973 pExecMemAllocator->cChunks = idxChunk;
974 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
975 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
976 pExecMemAllocator->aChunks[idxChunk].pvChunk = NULL;
977 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
978#endif
979 }
980#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
981 }
982#endif
983 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
984 return rc;
985}
986
987
988/**
989 * Initializes the executable memory allocator for native recompilation on the
990 * calling EMT.
991 *
992 * @returns VBox status code.
993 * @param pVCpu The cross context virtual CPU structure of the calling
994 * thread.
995 * @param cbMax The max size of the allocator.
996 * @param cbInitial The initial allocator size.
997 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
998 * dependent).
999 */
1000int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk)
1001{
1002 /*
1003 * Validate input.
1004 */
1005 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
1006 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
1007 AssertLogRelMsgReturn( cbChunk != UINT32_MAX
1008 || cbChunk == 0
1009 || ( RT_IS_POWER_OF_TWO(cbChunk)
1010 && cbChunk >= _1M
1011 && cbChunk <= _256M
1012 && cbChunk <= cbMax),
1013 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
1014 VERR_OUT_OF_RANGE);
1015
1016 /*
1017 * Adjust/figure out the chunk size.
1018 */
1019 if (cbChunk == 0 || cbChunk == UINT32_MAX)
1020 {
1021 if (cbMax >= _256M)
1022 cbChunk = _64M;
1023 else
1024 {
1025 if (cbMax < _16M)
1026 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
1027 else
1028 cbChunk = (uint32_t)cbMax / 4;
1029 if (!RT_IS_POWER_OF_TWO(cbChunk))
1030 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
1031 }
1032 }
1033
1034 if (cbChunk > cbMax)
1035 cbMax = cbChunk;
1036 else
1037 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
1038 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
1039 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
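    /* Example (illustrative): cbMax = 128 MiB with cbChunk = 0 gives
       cbChunk = 128 MiB / 4 = 32 MiB (already a power of two), cbMax stays
       at 128 MiB and cMaxChunks = 4. */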
1040
1041 /*
1042 * Allocate and initialize the allocator instance.
1043 */
1044 size_t cbNeeded = RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]);
1045#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1046 size_t const offBitmaps = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
1047 size_t const cbBitmap = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3);
1048 cbNeeded += cbBitmap * cMaxChunks;
1049 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
1050 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
1051#endif
1052#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
1053 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
1054 cbNeeded += sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
1055#endif
1056 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
1057 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
1058 VERR_NO_MEMORY);
1059 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
1060 pExecMemAllocator->cbChunk = cbChunk;
1061 pExecMemAllocator->cMaxChunks = cMaxChunks;
1062 pExecMemAllocator->cChunks = 0;
1063 pExecMemAllocator->idxChunkHint = 0;
1064 pExecMemAllocator->cAllocations = 0;
1065 pExecMemAllocator->cbTotal = 0;
1066 pExecMemAllocator->cbFree = 0;
1067 pExecMemAllocator->cbAllocated = 0;
1068#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1069 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
1070 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1071 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
1072 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmap); /* Mark everything as allocated. Clear when chunks are added. */
1073#endif
1074#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
1075 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
1076#endif
1077 for (uint32_t i = 0; i < cMaxChunks; i++)
1078 {
1079#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1080 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
1081 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
1082#else
1083 pExecMemAllocator->aChunks[i].hHeap = NIL_RTHEAPSIMPLE;
1084#endif
1085 pExecMemAllocator->aChunks[i].pvChunk = NULL;
1086#ifdef IN_RING0
1087 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
1088#else
1089 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
1090#endif
1091 }
1092 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
1093
1094 /*
1095 * Do the initial allocations.
1096 */
1097 while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
1098 {
1099 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
1100 AssertLogRelRCReturn(rc, rc);
1101 }
1102
1103 pExecMemAllocator->idxChunkHint = 0;
1104
1105 return VINF_SUCCESS;
1106}
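/* Illustrative call (parameter values are made up for the example, not VBox
 * defaults):
 *
 *     int rc = iemExecMemAllocatorInit(pVCpu, _64M, _16M, 0);
 *
 * With cbMax = 64 MiB and cbChunk = 0, the logic above picks a 16 MiB chunk
 * size (cbMax / 4), giving cMaxChunks = 4, and cbInitial = 16 MiB grows the
 * allocator by one chunk up front. */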
1107
1108
1109/*********************************************************************************************************************************
1110* Native Recompilation *
1111*********************************************************************************************************************************/
1112
1113
1114/**
1115 * Used by TB code when encountering a non-zero status or rcPassUp after a call.
1116 */
1117IEM_DECL_IMPL_DEF(int, iemNativeHlpExecStatusCodeFiddling,(PVMCPUCC pVCpu, int rc, uint8_t idxInstr))
1118{
1119 pVCpu->iem.s.cInstructions += idxInstr;
1120 return VBOXSTRICTRC_VAL(iemExecStatusCodeFiddling(pVCpu, rc == VINF_IEM_REEXEC_BREAK ? VINF_SUCCESS : rc));
1121}
1122
1123
1124/**
1125 * Reinitializes the native recompiler state.
1126 *
1127 * Called before starting a new recompile job.
1128 */
1129static PIEMRECOMPILERSTATE iemNativeReInit(PIEMRECOMPILERSTATE pReNative, PCIEMTB pTb)
1130{
1131 pReNative->cLabels = 0;
1132 pReNative->cFixups = 0;
1133 pReNative->pTbOrg = pTb;
1134 return pReNative;
1135}
1136
1137
1138/**
1139 * Allocates and initializes the native recompiler state.
1140 *
1141 * This is called the first time an EMT wants to recompile something.
1142 *
1143 * @returns Pointer to the new recompiler state.
1144 * @param pVCpu The cross context virtual CPU structure of the calling
1145 * thread.
1146 * @param pTb The TB that's about to be recompiled.
1147 * @thread EMT(pVCpu)
1148 */
1149static PIEMRECOMPILERSTATE iemNativeInit(PVMCPUCC pVCpu, PCIEMTB pTb)
1150{
1151 VMCPU_ASSERT_EMT(pVCpu);
1152
1153 PIEMRECOMPILERSTATE pReNative = (PIEMRECOMPILERSTATE)RTMemAllocZ(sizeof(*pReNative));
1154 AssertReturn(pReNative, NULL);
1155
1156 /*
1157 * Try allocate all the buffers and stuff we need.
1158 */
1159 pReNative->pInstrBuf = (PIEMNATIVEINSTR)RTMemAllocZ(_64K);
1160 pReNative->paLabels = (PIEMNATIVELABEL)RTMemAllocZ(sizeof(IEMNATIVELABEL) * _8K);
1161 pReNative->paFixups = (PIEMNATIVEFIXUP)RTMemAllocZ(sizeof(IEMNATIVEFIXUP) * _16K);
1162 if (RT_LIKELY( pReNative->pInstrBuf
1163 && pReNative->paLabels
1164 && pReNative->paFixups))
1165 {
1166 /*
1167 * Set the buffer & array sizes on success.
1168 */
1169 pReNative->cInstrBufAlloc = _64K / sizeof(IEMNATIVEINSTR);
1170 pReNative->cLabelsAlloc = _8K;
1171 pReNative->cFixupsAlloc = _16K;
1172
1173 /*
1174 * Done, just need to save it and reinit it.
1175 */
1176 pVCpu->iem.s.pNativeRecompilerStateR3 = pReNative;
1177 return iemNativeReInit(pReNative, pTb);
1178 }
1179
1180 /*
1181 * Failed. Cleanup and return.
1182 */
1183 AssertFailed();
1184 RTMemFree(pReNative->pInstrBuf);
1185 RTMemFree(pReNative->paLabels);
1186 RTMemFree(pReNative->paFixups);
1187 RTMemFree(pReNative);
1188 return NULL;
1189}
1190
1191
1192/**
1193 * Defines a label.
1194 *
1195 * @returns Label ID.
1196 * @param pReNative The native recompile state.
1197 * @param enmType The label type.
1198 * @param offWhere The instruction offset of the label. UINT32_MAX if the
1199 * label is not yet defined (default).
1200 * @param uData Data associated with the label. Only applicable to
1201 * certain type of labels. Default is zero.
1202 */
1203DECLHIDDEN(uint32_t) iemNativeMakeLabel(PIEMRECOMPILERSTATE pReNative, IEMNATIVELABELTYPE enmType,
1204 uint32_t offWhere /*= UINT32_MAX*/, uint16_t uData /*= 0*/) RT_NOEXCEPT
1205{
1206 /*
1207 * Do we have the label already?
1208 */
1209 PIEMNATIVELABEL paLabels = pReNative->paLabels;
1210 uint32_t const cLabels = pReNative->cLabels;
1211 for (uint32_t i = 0; i < cLabels; i++)
1212 if ( paLabels[i].enmType == enmType
1213 && paLabels[i].uData == uData)
1214 {
1215 if (paLabels[i].off == offWhere || offWhere == UINT32_MAX)
1216 return i;
1217 if (paLabels[i].off == UINT32_MAX)
1218 {
1219 paLabels[i].off = offWhere;
1220 return i;
1221 }
1222 }
1223
1224 /*
1225 * Make sure we've got room for another label.
1226 */
1227 if (RT_LIKELY(cLabels < pReNative->cLabelsAlloc))
1228 { /* likely */ }
1229 else
1230 {
1231 uint32_t cNew = pReNative->cLabelsAlloc;
1232 AssertReturn(cNew, UINT32_MAX);
1233 AssertReturn(cLabels == cNew, UINT32_MAX);
1234 cNew *= 2;
1235 AssertReturn(cNew <= _64K, UINT32_MAX); /* IEMNATIVEFIXUP::idxLabel type restricts this */
1236 paLabels = (PIEMNATIVELABEL)RTMemRealloc(paLabels, cNew * sizeof(paLabels[0]));
1237 AssertReturn(paLabels, UINT32_MAX);
1238 pReNative->paLabels = paLabels;
1239 pReNative->cLabelsAlloc = cNew;
1240 }
1241
1242 /*
1243 * Define a new label.
1244 */
1245 paLabels[cLabels].off = offWhere;
1246 paLabels[cLabels].enmType = enmType;
1247 paLabels[cLabels].uData = uData;
1248 pReNative->cLabels = cLabels + 1;
1249 return cLabels;
1250}
1251
1252
1253/**
1254 * Looks up a label.
1255 *
1256 * @returns Label ID if found, UINT32_MAX if not.
1257 */
1258static uint32_t iemNativeFindLabel(PIEMRECOMPILERSTATE pReNative, IEMNATIVELABELTYPE enmType,
1259 uint32_t offWhere = UINT32_MAX, uint16_t uData = 0) RT_NOEXCEPT
1260{
1261 PIEMNATIVELABEL paLabels = pReNative->paLabels;
1262 uint32_t const cLabels = pReNative->cLabels;
1263 for (uint32_t i = 0; i < cLabels; i++)
1264 if ( paLabels[i].enmType == enmType
1265 && paLabels[i].uData == uData
1266 && ( paLabels[i].off == offWhere
1267 || offWhere == UINT32_MAX
1268 || paLabels[i].off == UINT32_MAX))
1269 return i;
1270 return UINT32_MAX;
1271}
1272
1273
1274
1275/**
1276 * Adds a fixup.
1277 *
1278 * @returns Success indicator.
1279 * @param pReNative The native recompile state.
1280 * @param offWhere The instruction offset of the fixup location.
1281 * @param idxLabel The target label ID for the fixup.
1282 * @param enmType The fixup type.
1283 * @param offAddend Fixup addend if applicable to the type. Default is 0.
1284 */
1285DECLHIDDEN(bool) iemNativeAddFixup(PIEMRECOMPILERSTATE pReNative, uint32_t offWhere, uint32_t idxLabel,
1286 IEMNATIVEFIXUPTYPE enmType, int8_t offAddend /*= 0*/) RT_NOEXCEPT
1287{
1288 Assert(idxLabel <= UINT16_MAX);
1289 Assert((unsigned)enmType <= UINT8_MAX);
1290
1291 /*
1292 * Make sure we've room.
1293 */
1294 PIEMNATIVEFIXUP paFixups = pReNative->paFixups;
1295 uint32_t const cFixups = pReNative->cFixups;
1296 if (RT_LIKELY(cFixups < pReNative->cFixupsAlloc))
1297 { /* likely */ }
1298 else
1299 {
1300 uint32_t cNew = pReNative->cFixupsAlloc;
1301 AssertReturn(cNew, false);
1302 AssertReturn(cFixups == cNew, false);
1303 cNew *= 2;
1304 AssertReturn(cNew <= _128K, false);
1305 paFixups = (PIEMNATIVEFIXUP)RTMemRealloc(paFixups, cNew * sizeof(paFixups[0]));
1306 AssertReturn(paFixups, false);
1307 pReNative->paFixups = paFixups;
1308 pReNative->cFixupsAlloc = cNew;
1309 }
1310
1311 /*
1312 * Add the fixup.
1313 */
1314 paFixups[cFixups].off = offWhere;
1315 paFixups[cFixups].idxLabel = (uint16_t)idxLabel;
1316 paFixups[cFixups].enmType = enmType;
1317 paFixups[cFixups].offAddend = offAddend;
1318 pReNative->cFixups = cFixups + 1;
1319 return true;
1320}
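/* Usage sketch (illustrative; see iemNativeEmitCheckCallRetAndPassUp below for
 * a real instance): an emitter obtains a label, records a fixup at the spot of
 * the yet-unknown displacement, and emits placeholder bytes for a later
 * patching pass to fill in:
 *
 *     uint32_t const idxLabel = iemNativeMakeLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
 *     AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
 *     AssertReturn(iemNativeAddFixup(pReNative, off, idxLabel, kIemNativeFixupType_Rel32, -4), UINT32_MAX);
 *     pbCodeBuf[off++] = 0x00;  pbCodeBuf[off++] = 0x00;   // 4 placeholder bytes for the rel32
 *     pbCodeBuf[off++] = 0x00;  pbCodeBuf[off++] = 0x00;
 */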
1321
1322/**
1323 * Slow code path for iemNativeInstrBufEnsure.
1324 */
1325DECLHIDDEN(PIEMNATIVEINSTR) iemNativeInstrBufEnsureSlow(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1326 uint32_t cInstrReq) RT_NOEXCEPT
1327{
1328 /* Double the buffer size till we meet the request. */
1329 uint32_t cNew = pReNative->cInstrBufAlloc;
1330 AssertReturn(cNew > 0, NULL);
1331 do
1332 cNew *= 2;
1333 while (cNew < off + cInstrReq);
1334
1335 uint32_t const cbNew = cNew * sizeof(IEMNATIVEINSTR);
1336 AssertReturn(cbNew <= _2M, NULL);
1337
1338 void *pvNew = RTMemRealloc(pReNative->pInstrBuf, cbNew);
1339 AssertReturn(pvNew, NULL);
1340
1341 pReNative->cInstrBufAlloc = cNew;
1342 return pReNative->pInstrBuf = (PIEMNATIVEINSTR)pvNew;
1343}
1344
1345
1346/**
1347 * Emits code for checking the return code of a call and rcPassUp, returning
1348 * from the code if either is non-zero.
1349 */
1350DECLHIDDEN(uint32_t) iemNativeEmitCheckCallRetAndPassUp(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1351 uint8_t idxInstr) RT_NOEXCEPT
1352{
1353#ifdef RT_ARCH_AMD64
1354 /*
1355 * AMD64: eax = call status code.
1356 */
1357
1358 /* edx = rcPassUp */
1359 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, X86_GREG_xDX, RT_UOFFSETOF(VMCPUCC, iem.s.rcPassUp));
1360 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1361
1362 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1363 AssertReturn(pbCodeBuf, UINT32_MAX);
1364
1365 /* edx = eax | rcPassUp*/
1366 pbCodeBuf[off++] = 0x0b; /* or edx, eax */
1367 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xDX, X86_GREG_xAX);
1368
1369 /* Jump to non-zero status return path, loading cl with the instruction number. */
1370 pbCodeBuf[off++] = 0xb0 + X86_GREG_xCX; /* mov cl, imm8 (pCallEntry->idxInstr) */
1371 pbCodeBuf[off++] = idxInstr;
1372
1373 pbCodeBuf[off++] = 0x0f; /* jnz rel32 */
1374 pbCodeBuf[off++] = 0x85;
1375 uint32_t const idxLabel = iemNativeMakeLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1376 AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
1377 AssertReturn(iemNativeAddFixup(pReNative, off, idxLabel, kIemNativeFixupType_Rel32, -4), UINT32_MAX);
1378 pbCodeBuf[off++] = 0x00;
1379 pbCodeBuf[off++] = 0x00;
1380 pbCodeBuf[off++] = 0x00;
1381 pbCodeBuf[off++] = 0x00;
1382
1383 /* done. */
1384
1385#elif RT_ARCH_ARM64
1386 /*
1387 * ARM64: w0 = call status code.
1388 */
1389 off = iemNativeEmitLoadGprImm64(pReNative, off, ARMV8_A64_REG_X2, idxInstr); /** @todo 32-bit imm load? Fixed counter register? */
1390 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, ARMV8_A64_REG_X3, RT_UOFFSETOF(VMCPUCC, iem.s.rcPassUp));
1391
1392 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1393 AssertReturn(pu32CodeBuf, UINT32_MAX);
1394
1395 pu32CodeBuf[off++] = Armv8A64MkInstrOrr(ARMV8_A64_REG_X4, ARMV8_A64_REG_X3, ARMV8_A64_REG_X0, false /*f64Bit*/);
1396
1397 uint32_t const idxLabel = iemNativeMakeLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1398 AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
1399 AssertReturn(iemNativeAddFixup(pReNative, off, idxLabel, kIemNativeFixupType_RelImm19At5), UINT32_MAX);
1400 pu32CodeBuf[off++] = Armv8A64MkInstrCbzCbnz(true /*fJmpIfNotZero*/, ARMV8_A64_REG_X4, false /*f64Bit*/);
1401
1402#else
1403# error "port me"
1404#endif
1405 return off;
1406}
1407
1408
1409/**
1410 * Emits a call to a CImpl function or something similar.
1411 */
1412static int32_t iemNativeEmitCImplCall(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1413 uintptr_t pfnCImpl, uint8_t cbInstr, uint8_t cAddParams,
1414 uint64_t uParam0, uint64_t uParam1, uint64_t uParam2)
1415{
1416#ifdef VBOX_STRICT
1417 off = iemNativeEmitMarker(pReNative, off);
1418 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1419#endif
1420
1421 /*
1422 * Load the parameters.
1423 */
1424#if defined(RT_OS_WINDOWS) && defined(VBOXSTRICTRC_STRICT_ENABLED)
1425 /* Special-case the hidden VBOXSTRICTRC pointer. */
1426 off = iemNativeEmitLoadGprFromGpr( pReNative, off, IEMNATIVE_CALL_ARG1_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1427 off = iemNativeEmitLoadGprImm64( pReNative, off, IEMNATIVE_CALL_ARG2_GREG, cbInstr); /** @todo 8-bit reg load opt for amd64 */
1428 if (cAddParams > 0)
1429 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, uParam0);
1430 if (cAddParams > 1)
1431 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, uParam1);
1432 if (cAddParams > 2)
1433 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG1, uParam2);
1434 off = iemNativeEmitLeaGrpByBp(pReNative, off, X86_GREG_xCX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict */
1435
1436#else
1437 AssertCompile(IEMNATIVE_CALL_ARG_GREG_COUNT >= 4);
1438 off = iemNativeEmitLoadGprFromGpr( pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1439 off = iemNativeEmitLoadGprImm64( pReNative, off, IEMNATIVE_CALL_ARG1_GREG, cbInstr); /** @todo 8-bit reg load opt for amd64 */
1440 if (cAddParams > 0)
1441 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG2_GREG, uParam0);
1442 if (cAddParams > 1)
1443 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, uParam1);
1444 if (cAddParams > 2)
1445# if IEMNATIVE_CALL_ARG_GREG_COUNT >= 5
1446 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG4_GREG, uParam2);
1447# else
1448 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, uParam2);
1449# endif
1450#endif
1451 AssertReturn(off != UINT32_MAX, off);
1452
1453 /*
1454 * Make the call.
1455 */
1456#ifdef RT_ARCH_AMD64
1457 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, pfnCImpl);
1458
1459 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1460 AssertReturn(pbCodeBuf, UINT32_MAX);
1461 pbCodeBuf[off++] = 0xff; /* call rax */
1462 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1463
1464# if defined(VBOXSTRICTRC_STRICT_ENABLED) && defined(RT_OS_WINDOWS)
1465 off = iemNativeEmitLoadGprByBpU32(pReNative, off, X86_GREG_xAX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict (see above) */
1466# endif
1467
1468#elif defined(RT_ARCH_ARM64)
1469 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0, pfnCImpl);
1470
1471 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1472 AssertReturn(pu32CodeBuf, UINT32_MAX);
1473 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1474
1475#else
1476# error "Port me!"
1477#endif
1478
1479 /*
1480 * Check the status code.
1481 */
1482 return iemNativeEmitCheckCallRetAndPassUp(pReNative, off, idxInstr);
1483}
1484
1485
1486/**
1487 * Emits a call to a threaded worker function.
1488 */
1489static int32_t iemNativeEmitThreadedCall(PIEMRECOMPILERSTATE pReNative, uint32_t off, PCIEMTHRDEDCALLENTRY pCallEntry)
1490{
1491#ifdef VBOX_STRICT
1492 off = iemNativeEmitMarker(pReNative, off);
1493 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1494#endif
1495 uint8_t const cParams = g_acIemThreadedFunctionUsedArgs[pCallEntry->enmFunction];
1496
1497#ifdef RT_ARCH_AMD64
1498 /* Load the parameters and emit the call. */
1499# ifdef RT_OS_WINDOWS
1500# ifndef VBOXSTRICTRC_STRICT_ENABLED
1501 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xCX, IEMNATIVE_REG_FIXED_PVMCPU);
1502 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1503 if (cParams > 0)
1504 {
1505 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xDX, pCallEntry->auParams[0]);
1506 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1507 }
1508 if (cParams > 1)
1509 {
1510 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x8, pCallEntry->auParams[1]);
1511 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1512 }
1513 if (cParams > 2)
1514 {
1515 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x9, pCallEntry->auParams[2]);
1516 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1517 }
1518# else /* VBOXSTRICTRC: Returned via hidden parameter. Sigh. */
1519 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, IEMNATIVE_REG_FIXED_PVMCPU);
1520 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1521 if (cParams > 0)
1522 {
1523 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x8, pCallEntry->auParams[0]);
1524 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1525 }
1526 if (cParams > 1)
1527 {
1528 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x9, pCallEntry->auParams[1]);
1529 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1530 }
1531 if (cParams > 2)
1532 {
1533 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x10, pCallEntry->auParams[2]);
1534 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1535 }
1536 off = iemNativeEmitStoreGprByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, X86_GREG_x10);
1537 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1538 off = iemNativeEmitLeaGrpByBp(pReNative, off, X86_GREG_xCX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict */
1539 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1540# endif /* VBOXSTRICTRC_STRICT_ENABLED */
1541# else
1542 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDI, IEMNATIVE_REG_FIXED_PVMCPU);
1543 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1544 if (cParams > 0)
1545 {
1546 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xSI, pCallEntry->auParams[0]);
1547 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1548 }
1549 if (cParams > 1)
1550 {
1551 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xDX, pCallEntry->auParams[1]);
1552 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1553 }
1554 if (cParams > 2)
1555 {
1556 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xCX, pCallEntry->auParams[2]);
1557 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1558 }
1559# endif
1560 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, (uintptr_t)g_apfnIemThreadedFunctions[pCallEntry->enmFunction]);
1561 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1562
1563 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1564 AssertReturn(pbCodeBuf, UINT32_MAX);
1565 pbCodeBuf[off++] = 0xff; /* call rax */
1566 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1567
1568# if defined(VBOXSTRICTRC_STRICT_ENABLED) && defined(RT_OS_WINDOWS)
1569 off = iemNativeEmitLoadGprByBpU32(pReNative, off, X86_GREG_xAX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict (see above) */
1570# endif
1571
1572#elif defined(RT_ARCH_ARM64)
1573 /*
1574 * ARM64:
1575 */
1576 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1577 if (cParams > 0)
1578 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG1_GREG, pCallEntry->auParams[0]);
1579 if (cParams > 1)
1580 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG2_GREG, pCallEntry->auParams[1]);
1581 if (cParams > 2)
1582 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, pCallEntry->auParams[2]);
1583 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0,
1584 (uintptr_t)g_apfnIemThreadedFunctions[pCallEntry->enmFunction]);
1585
1586 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1587 AssertReturn(pu32CodeBuf, UINT32_MAX);
1588
1589 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1590
1591#else
1592# error "port me"
1593#endif
1594
1595 /*
1596 * Check the status code.
1597 */
1598 off = iemNativeEmitCheckCallRetAndPassUp(pReNative, off, pCallEntry->idxInstr);
1599 AssertReturn(off != UINT32_MAX, off);
1600
1601 return off;
1602}
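
/*
 * The ARM64 variant of the threaded call above is, schematically (symbolic register
 * names are used here since the concrete assignments live in the recompiler header):
 *
 *      mov     ARG0, <PVMCPU reg>                  ; pVCpu
 *      mov     ARG1, #auParams[0]                  ; only if cParams > 0, etc.
 *      mov     ARG2, #auParams[1]
 *      mov     ARG3, #auParams[2]
 *      mov     TMP0, =g_apfnIemThreadedFunctions[enmFunction]
 *      blr     TMP0
 *
 * The return value is then run through the common status check, which presumably
 * branches to the NonZeroRetOrPassUp label handled by iemNativeEmitRcFiddling below.
 */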
1603
1604
1605/**
1606 * Emits the RC fiddling code for handling non-zero return code or rcPassUp.
1607 */
1608static uint32_t iemNativeEmitRcFiddling(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint32_t idxReturnLabel)
1609{
1610 /*
1611 * Generate the rc + rcPassUp fiddling code if needed.
1612 */
1613 uint32_t idxLabel = iemNativeFindLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1614 if (idxLabel != UINT32_MAX)
1615 {
1616 Assert(pReNative->paLabels[idxLabel].off == UINT32_MAX);
1617 pReNative->paLabels[idxLabel].off = off;
1618
1619 /* iemNativeHlpExecStatusCodeFiddling(PVMCPUCC pVCpu, int rc, uint8_t idxInstr) */
1620#ifdef RT_ARCH_AMD64
1621 /*
1622 * AMD64:
1623 */
1624 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
1625 AssertReturn(pbCodeBuf, UINT32_MAX);
1626
1627 /* Call helper and jump to return point. */
1628# ifdef RT_OS_WINDOWS
1629 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_x8, X86_GREG_xCX); /* cl = instruction number */
1630 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1631 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xCX, IEMNATIVE_REG_FIXED_PVMCPU);
1632 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1633 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, X86_GREG_xAX);
1634 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1635# else
1636 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDI, IEMNATIVE_REG_FIXED_PVMCPU);
1637 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1638 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xSI, X86_GREG_xAX);
1639 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1640 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, X86_GREG_xCX); /* cl = instruction number */
1641 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1642# endif
1643 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, (uintptr_t)iemNativeHlpExecStatusCodeFiddling);
1644 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1645
1646 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1647 AssertReturn(pbCodeBuf, UINT32_MAX);
1648 pbCodeBuf[off++] = 0xff; /* call rax */
1649 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1650
1651 /* Jump to common return point. */
1652 uint32_t offRel = pReNative->paLabels[idxReturnLabel].off - (off + 2);
1653 if (-(int32_t)offRel <= 127)
1654 {
1655 pbCodeBuf[off++] = 0xeb; /* jmp rel8 */
1656 pbCodeBuf[off++] = (uint8_t)offRel;
1657 pbCodeBuf[off++] = 0xcc; /* int3 - fill the spare byte instead of leaving it uninitialized */
1658 }
1659 else
1660 {
1661 offRel -= 3;
1662 pbCodeBuf[off++] = 0xe9; /* jmp rel32 */
1663 pbCodeBuf[off++] = RT_BYTE1(offRel);
1664 pbCodeBuf[off++] = RT_BYTE2(offRel);
1665 pbCodeBuf[off++] = RT_BYTE3(offRel);
1666 pbCodeBuf[off++] = RT_BYTE4(offRel);
1667 }
1668 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1669
1670#elif defined(RT_ARCH_ARM64)
1671 /*
1672 * ARM64:
1673 */
1674 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG1_GREG, IEMNATIVE_CALL_RET_GREG);
1675 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1676 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1677 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1678 /* IEMNATIVE_CALL_ARG2_GREG is already set. */
1679 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0, (uintptr_t)iemNativeHlpExecStatusCodeFiddling);
1680 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1681
1682 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1683 AssertReturn(pu32CodeBuf, UINT32_MAX);
1684 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1685
1686 /* Jump back to the common return point. */
1687 int32_t const offRel = pReNative->paLabels[idxReturnLabel].off - off;
1688 pu32CodeBuf[off++] = Armv8A64MkInstrB(offRel);
1689#else
1690# error "port me"
1691#endif
1692 }
1693 return off;
1694}
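
/*
 * Note that the NonZeroRetOrPassUp label is only looked up here, never created; if no
 * earlier status check requested it, the whole fiddling tail is omitted.  When present,
 * the AMD64 SysV flavour boils down to roughly:
 *
 *  NonZeroRetOrPassUp:
 *      mov     rdi, rbx                ; pVCpu
 *      mov     rsi, rax                ; rc from the failed call
 *      mov     rdx, rcx                ; instruction number
 *      mov     rax, iemNativeHlpExecStatusCodeFiddling
 *      call    rax
 *      jmp     common-return           ; back to the epilog's return label
 */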
1695
1696
1697/**
1698 * Emits a standard epilog.
1699 */
1700static uint32_t iemNativeEmitEpilog(PIEMRECOMPILERSTATE pReNative, uint32_t off)
1701{
1702 /*
1703 * Successful return, so clear the return register (eax, w0).
1704 */
1705 off = iemNativeEmitGprZero(pReNative, off, IEMNATIVE_CALL_RET_GREG);
1706 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1707
1708 /*
1709 * Define label for common return point.
1710 */
1711 uint32_t const idxReturn = iemNativeMakeLabel(pReNative, kIemNativeLabelType_Return, off);
1712 AssertReturn(idxReturn != UINT32_MAX, UINT32_MAX);
1713
1714 /*
1715 * Restore registers and return.
1716 */
1717#ifdef RT_ARCH_AMD64
1718 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
1719 AssertReturn(pbCodeBuf, UINT32_MAX);
1720
1721 /* Reposition esp at the r15 restore point. */
1722 pbCodeBuf[off++] = X86_OP_REX_W;
1723 pbCodeBuf[off++] = 0x8d; /* lea rsp, [rbp - (gcc ? 5 : 7) * 8] */
1724 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM1, X86_GREG_xSP, X86_GREG_xBP);
1725 pbCodeBuf[off++] = (uint8_t)IEMNATIVE_FP_OFF_LAST_PUSH;
1726
1727 /* Pop non-volatile registers and return */
1728 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r15 */
1729 pbCodeBuf[off++] = 0x58 + X86_GREG_x15 - 8;
1730 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r14 */
1731 pbCodeBuf[off++] = 0x58 + X86_GREG_x14 - 8;
1732 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r13 */
1733 pbCodeBuf[off++] = 0x58 + X86_GREG_x13 - 8;
1734 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r12 */
1735 pbCodeBuf[off++] = 0x58 + X86_GREG_x12 - 8;
1736# ifdef RT_OS_WINDOWS
1737 pbCodeBuf[off++] = 0x58 + X86_GREG_xDI; /* pop rdi */
1738 pbCodeBuf[off++] = 0x58 + X86_GREG_xSI; /* pop rsi */
1739# endif
1740 pbCodeBuf[off++] = 0x58 + X86_GREG_xBX; /* pop rbx */
1741 pbCodeBuf[off++] = 0xc9; /* leave */
1742 pbCodeBuf[off++] = 0xc3; /* ret */
1743 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1744
1745#elif defined(RT_ARCH_ARM64)
1746 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1747 AssertReturn(pu32CodeBuf, UINT32_MAX);
1748
1749 /* ldp x19, x20, [sp, #IEMNATIVE_FRAME_VAR_SIZE]! ; Deallocate the variable space and restore x19+x20. */
1750 AssertCompile(IEMNATIVE_FRAME_VAR_SIZE < 64*8);
1751 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kPreIndex,
1752 ARMV8_A64_REG_X19, ARMV8_A64_REG_X20, ARMV8_A64_REG_SP,
1753 IEMNATIVE_FRAME_VAR_SIZE / 8);
1754 /* Restore x21 thru x28 + BP and LR (ret address) (SP remains unchanged in the kSigned variant). */
1755 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1756 ARMV8_A64_REG_X21, ARMV8_A64_REG_X22, ARMV8_A64_REG_SP, 2);
1757 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1758 ARMV8_A64_REG_X23, ARMV8_A64_REG_X24, ARMV8_A64_REG_SP, 4);
1759 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1760 ARMV8_A64_REG_X25, ARMV8_A64_REG_X26, ARMV8_A64_REG_SP, 6);
1761 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1762 ARMV8_A64_REG_X27, ARMV8_A64_REG_X28, ARMV8_A64_REG_SP, 8);
1763 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1764 ARMV8_A64_REG_BP, ARMV8_A64_REG_LR, ARMV8_A64_REG_SP, 10);
1765 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1766
1767 /* add sp, sp, IEMNATIVE_FRAME_SAVE_REG_SIZE ; */
1768 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE < 4096);
1769 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(false /*fSub*/, ARMV8_A64_REG_SP, ARMV8_A64_REG_SP, IEMNATIVE_FRAME_SAVE_REG_SIZE);
1770
1771 /* retab / ret */
1772# ifdef RT_OS_DARWIN /** @todo See todo on pacibsp in the prolog. */
1773 if (1)
1774 pu32CodeBuf[off++] = ARMV8_A64_INSTR_RETAB;
1775 else
1776# endif
1777 pu32CodeBuf[off++] = ARMV8_A64_INSTR_RET;
1778
1779#else
1780# error "port me"
1781#endif
1782
1783 return iemNativeEmitRcFiddling(pReNative, off, idxReturn);
1784}
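
/*
 * Schematically, the AMD64 epilog above is (Windows additionally pops rdi and rsi
 * between r12 and rbx):
 *
 *      xor     eax, eax                                ; return VINF_SUCCESS (or however
 *                                                      ; iemNativeEmitGprZero encodes it)
 *  common-return:
 *      lea     rsp, [rbp + IEMNATIVE_FP_OFF_LAST_PUSH] ; negative offset, i.e. the r15 slot
 *      pop     r15
 *      pop     r14
 *      pop     r13
 *      pop     r12
 *      pop     rbx
 *      leave
 *      ret
 */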
1785
1786
1787/**
1788 * Emits a standard prolog.
1789 */
1790static uint32_t iemNativeEmitProlog(PIEMRECOMPILERSTATE pReNative, uint32_t off)
1791{
1792#ifdef RT_ARCH_AMD64
1793 /*
1794 * Set up a regular xBP stack frame, pushing all non-volatile GPRs,
1795 * reserving 64 bytes for stack variables plus 4 non-register argument
1796 * slots. Fixed register assignment: xBX = pVCpu (IEMNATIVE_REG_FIXED_PVMCPU).
1797 *
1798 * Since we always do the same register spilling, we can use the same
1799 * unwind description for all the code.
1800 */
1801 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
1802 AssertReturn(pbCodeBuf, UINT32_MAX);
1803 pbCodeBuf[off++] = 0x50 + X86_GREG_xBP; /* push rbp */
1804 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbp, rsp */
1805 pbCodeBuf[off++] = 0x8b;
1806 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBP, X86_GREG_xSP);
1807 pbCodeBuf[off++] = 0x50 + X86_GREG_xBX; /* push rbx */
1808 AssertCompile(IEMNATIVE_REG_FIXED_PVMCPU == X86_GREG_xBX);
1809# ifdef RT_OS_WINDOWS
1810 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rcx ; RBX = pVCpu */
1811 pbCodeBuf[off++] = 0x8b;
1812 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xCX);
1813 pbCodeBuf[off++] = 0x50 + X86_GREG_xSI; /* push rsi */
1814 pbCodeBuf[off++] = 0x50 + X86_GREG_xDI; /* push rdi */
1815# else
1816 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rdi ; RBX = pVCpu */
1817 pbCodeBuf[off++] = 0x8b;
1818 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xDI);
1819# endif
1820 pbCodeBuf[off++] = X86_OP_REX_B; /* push r12 */
1821 pbCodeBuf[off++] = 0x50 + X86_GREG_x12 - 8;
1822 pbCodeBuf[off++] = X86_OP_REX_B; /* push r13 */
1823 pbCodeBuf[off++] = 0x50 + X86_GREG_x13 - 8;
1824 pbCodeBuf[off++] = X86_OP_REX_B; /* push r14 */
1825 pbCodeBuf[off++] = 0x50 + X86_GREG_x14 - 8;
1826 pbCodeBuf[off++] = X86_OP_REX_B; /* push r15 */
1827 pbCodeBuf[off++] = 0x50 + X86_GREG_x15 - 8;
1828
1829 off = iemNativeEmitSubGprImm(pReNative, off, /* sub rsp, byte 28h */
1830 X86_GREG_xSP,
1831 IEMNATIVE_FRAME_ALIGN_SIZE
1832 + IEMNATIVE_FRAME_VAR_SIZE
1833 + IEMNATIVE_FRAME_STACK_ARG_COUNT * 8
1834 + IEMNATIVE_FRAME_SHADOW_ARG_COUNT * 8);
1835 AssertCompile(!(IEMNATIVE_FRAME_VAR_SIZE & 0xf));
1836 AssertCompile(!(IEMNATIVE_FRAME_STACK_ARG_COUNT & 0x1));
1837 AssertCompile(!(IEMNATIVE_FRAME_SHADOW_ARG_COUNT & 0x1));
1838
1839#elif defined(RT_ARCH_ARM64)
1840 /*
1841 * We set up a stack frame exactly like on x86, only we have to push the
1842 * return address ourselves here. We save all non-volatile registers.
1843 */
1844 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1845 AssertReturn(pu32CodeBuf, UINT32_MAX);
1846
1847# ifdef RT_OS_DARWIN /** @todo This seems to be a requirement by libunwind for JIT FDEs. Investigate further, as we have been
1848 * unable to figure out where the BRK following the AUTHB*+XPACB* stuff comes from in libunwind. It is
1849 * definitely the dwarf stepping code, but until that is located it is very tedious to figure out whether
1850 * it is in any way conditional, so we just emit this instruction for now and hope for the best... */
1851 /* pacibsp */
1852 pu32CodeBuf[off++] = ARMV8_A64_INSTR_PACIBSP;
1853# endif
1854
1855 /* stp x19, x20, [sp, #-IEMNATIVE_FRAME_SAVE_REG_SIZE]! ; Allocate space for saving registers and place x19+x20 at the bottom. */
1856 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE < 64*8);
1857 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kPreIndex,
1858 ARMV8_A64_REG_X19, ARMV8_A64_REG_X20, ARMV8_A64_REG_SP,
1859 -IEMNATIVE_FRAME_SAVE_REG_SIZE / 8);
1860 /* Save x21 thru x28 (SP remains unchanged in the kSigned variant). */
1861 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1862 ARMV8_A64_REG_X21, ARMV8_A64_REG_X22, ARMV8_A64_REG_SP, 2);
1863 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1864 ARMV8_A64_REG_X23, ARMV8_A64_REG_X24, ARMV8_A64_REG_SP, 4);
1865 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1866 ARMV8_A64_REG_X25, ARMV8_A64_REG_X26, ARMV8_A64_REG_SP, 6);
1867 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1868 ARMV8_A64_REG_X27, ARMV8_A64_REG_X28, ARMV8_A64_REG_SP, 8);
1869 /* Save the BP and LR (ret address) registers at the top of the frame. */
1870 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1871 ARMV8_A64_REG_BP, ARMV8_A64_REG_LR, ARMV8_A64_REG_SP, 10);
1872 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1873 /* add bp, sp, IEMNATIVE_FRAME_SAVE_REG_SIZE - 16 ; Set BP to point to the old BP stack address. */
1874 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(false /*fSub*/, ARMV8_A64_REG_BP,
1875 ARMV8_A64_REG_SP, IEMNATIVE_FRAME_SAVE_REG_SIZE - 16);
1876
1877 /* sub sp, sp, IEMNATIVE_FRAME_VAR_SIZE ; Allocate the variable area from SP. */
1878 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(true /*fSub*/, ARMV8_A64_REG_SP, ARMV8_A64_REG_SP, IEMNATIVE_FRAME_VAR_SIZE);
1879
1880 /* mov r28, r0 */
1881 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_REG_FIXED_PVMCPU, IEMNATIVE_CALL_ARG0_GREG);
1882
1883#else
1884# error "port me"
1885#endif
1886 return off;
1887}
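
/*
 * Resulting AMD64 frame layout, schematically (exact offsets depend on the
 * IEMNATIVE_FRAME_* constants; Windows additionally saves rsi/rdi):
 *
 *      [rbp+08h]  return address
 *      [rbp+00h]  saved rbp
 *      [rbp-08h]  saved rbx (holds pVCpu from here on)
 *       ...       saved rsi, rdi (Windows only)
 *       ...       saved r12, r13, r14, r15    <- IEMNATIVE_FP_OFF_LAST_PUSH
 *       ...       variable area, stack argument slots and (on Windows) shadow argument
 *                 space, all allocated by the final sub rsp and addressed via the
 *                 IEMNATIVE_FP_OFF_* constants.
 */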
1888
1889
1890DECLINLINE(uint32_t) iemNativeEmitCImplCall1(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1891 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0)
1892{
1893 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 1, uArg0, 0, 0);
1894}
1895
1896
1897DECLINLINE(uint32_t) iemNativeEmitCImplCall2(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1898 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0, uint64_t uArg1)
1899{
1900 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 2, uArg0, uArg1, 0);
1901}
1902
1903
1904DECLINLINE(uint32_t) iemNativeEmitCImplCall3(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1905 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0, uint64_t uArg1, uint64_t uArg2)
1906{
1907 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 3, uArg0, uArg1, uArg2);
1908}
1909
1910
1911#if 0
1912/** Same as iemRegFinishClearingRF. */
1913DECLINLINE(uint32_t) iemNativeEmitFinishClearingRF(PIEMRECOMPILERSTATE pReNative, uint32_t off)
1914{
1915 RT_NOREF(pReNative, off);
1916#if 0
1917 uint32_t const fFlags = pReNative->pTbOrg->fFlags;
1918 if (fFlags & IEMTB_F_INHIBIT_SHADOW)
1919 {
1920 }
1921 IEMTB_F_IEM_F_MASK
1922
1923 //
1924 if (RT_LIKELY(!( pVCpu->cpum.GstCtx.eflags.uBoth
1925 & (X86_EFL_TF | X86_EFL_RF | CPUMCTX_INHIBIT_SHADOW | CPUMCTX_DBG_HIT_DRX_MASK | CPUMCTX_DBG_DBGF_MASK)) ))
1926 return VINF_SUCCESS;
1927 return iemFinishInstructionWithFlagsSet(pVCpu);
1928#else
1929 return UINT32_MAX;
1930#endif
1931}
1932
1933
1934/** Same as iemRegAddToEip32AndFinishingClearingRF. */
1935DECLINLINE(uint32_t) iemNativeEmitAddToEip32AndFinishingClearingRF(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t cbInstr)
1936{
1937 /* Increment RIP. */
1938 pVCpu->cpum.GstCtx.rip = (uint32_t)(pVCpu->cpum.GstCtx.eip + cbInstr);
1939
1940 /* Consider flags. */
1941 return iemNativeEmitFinishClearingRF(pReNative, off);
1942}
1943#endif
1944
1945/*
1946 * MC definitions for the native recompiler.
1947 */
1948
1949#define IEM_MC_DEFER_TO_CIMPL_0_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl) \
1950 return iemNativeEmitCImplCall0(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr) /** @todo not used ... */
1951
1952#define IEM_MC_DEFER_TO_CIMPL_1_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0) \
1953 return iemNativeEmitCImplCall1(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0)
1954
1955#define IEM_MC_DEFER_TO_CIMPL_2_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0, a1) \
1956 return iemNativeEmitCImplCall2(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0, a1)
1957
1958#define IEM_MC_DEFER_TO_CIMPL_3_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0, a1, a2) \
1959 return iemNativeEmitCImplCall3(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0, a1, a2)
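
/*
 * For illustration (instruction and arguments made up), an invocation like
 *      IEM_MC_DEFER_TO_CIMPL_2_RET_THREADED(2, 0, iemCImpl_Foo, uArg0, uArg1)
 * inside a generated recompiler function expands to
 *      return iemNativeEmitCImplCall2(pReNative, off, pCallEntry->idxInstr,
 *                                     (uintptr_t)iemCImpl_Foo, 2, uArg0, uArg1);
 * i.e. it yields the updated code buffer offset, or UINT32_MAX on failure.
 */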
1960
1961
1962#define IEM_MC_BEGIN(a_cArgs, a_cLocals, a_fFlags) {
1963
1964#define IEM_MC_END() } AssertFailedReturn(UINT32_MAX /* shouldn't be reached! */)
1965
1966#define IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC16(a_cbInstr) \
1967 return iemNativeEmitAddToIp16AndFinishingClearingRF(pReNative, off, a_cbInstr)
1968
1969#define IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC32(a_cbInstr) \
1970 return iemNativeEmitAddToEip32AndFinishingClearingRF(pReNative, off, a_cbInstr)
1971
1972#define IEM_MC_ADVANCE_RIP_AND_FINISH_THREADED_PC64(a_cbInstr) \
1973 return iemNativeEmitAddToRip64AndFinishingClearingRF(pReNative, off, a_cbInstr)
1974
1975
1976/*
1977 * Builtin functions.
1978 */
1979
1980/**
1981 * Built-in function that calls a C-implementation function taking zero arguments.
1982 */
1983static IEM_DECL_IEMNATIVERECOMPFUNC_DEF(iemNativeRecompFunc_BltIn_DeferToCImpl0)
1984{
1985 PFNIEMCIMPL0 const pfnCImpl = (PFNIEMCIMPL0)(uintptr_t)pCallEntry->auParams[0];
1986 uint8_t const cbInstr = (uint8_t)pCallEntry->auParams[1];
1987 return iemNativeEmitCImplCall(pReNative, off, pCallEntry->idxInstr, (uintptr_t)pfnCImpl, cbInstr, 0, 0, 0, 0);
1988}
1989
1990
1991
1992/*
1993 * Include g_apfnIemNativeRecompileFunctions and associated functions.
1994 *
1995 * This should probably live in its own file later, but let's see what the
1996 * compile times turn out to be first.
1997 */
1998#include "IEMNativeFunctions.cpp.h"
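
/* Note: entries in g_apfnIemNativeRecompileFunctions may be NULL, in which case
   iemNativeRecompile() below falls back to emitting a plain call to the threaded
   worker via iemNativeEmitThreadedCall() instead of specialized native code. */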
1999
2000
2001/**
2002 * Recompiles the given threaded TB into a native one.
2003 *
2004 * In case of failure the translation block will be returned as-is.
2005 *
2006 * @returns pTb.
2007 * @param pVCpu The cross context virtual CPU structure of the calling
2008 * thread.
2009 * @param pTb The threaded translation block to recompile into native code.
2010 */
2011PIEMTB iemNativeRecompile(PVMCPUCC pVCpu, PIEMTB pTb)
2012{
2013 /*
2014 * The first time through we allocate the recompiler state; on subsequent
2015 * calls we just need to reset it before using it again.
2016 */
2017 PIEMRECOMPILERSTATE pReNative = pVCpu->iem.s.pNativeRecompilerStateR3;
2018 if (RT_LIKELY(pReNative))
2019 iemNativeReInit(pReNative, pTb);
2020 else
2021 {
2022 pReNative = iemNativeInit(pVCpu, pTb);
2023 AssertReturn(pReNative, pTb);
2024 }
2025
2026 /*
2027 * Emit prolog code (fixed).
2028 */
2029 uint32_t off = iemNativeEmitProlog(pReNative, 0);
2030 AssertReturn(off != UINT32_MAX, pTb);
2031
2032 /*
2033 * Convert the calls to native code.
2034 */
2035 PCIEMTHRDEDCALLENTRY pCallEntry = pTb->Thrd.paCalls;
2036 uint32_t cCallsLeft = pTb->Thrd.cCalls;
2037 while (cCallsLeft-- > 0)
2038 {
2039 PFNIEMNATIVERECOMPFUNC const pfnRecom = g_apfnIemNativeRecompileFunctions[pCallEntry->enmFunction];
2040 if (pfnRecom) /** @todo stats on this. */
2041 off = pfnRecom(pReNative, off, pCallEntry);
2042 else
2043 off = iemNativeEmitThreadedCall(pReNative, off, pCallEntry);
2044 AssertReturn(off != UINT32_MAX, pTb);
2045
2046 pCallEntry++;
2047 }
2048
2049 /*
2050 * Emit the epilog code.
2051 */
2052 off = iemNativeEmitEpilog(pReNative, off);
2053 AssertReturn(off != UINT32_MAX, pTb);
2054
2055 /*
2056 * Make sure all labels have been defined.
2057 */
2058 PIEMNATIVELABEL const paLabels = pReNative->paLabels;
2059#ifdef VBOX_STRICT
2060 uint32_t const cLabels = pReNative->cLabels;
2061 for (uint32_t i = 0; i < cLabels; i++)
2062 AssertMsgReturn(paLabels[i].off < off, ("i=%d enmType=%d\n", i, paLabels[i].enmType), pTb);
2063#endif
2064
2065 /*
2066 * Allocate executable memory, copy over the code we've generated.
2067 */
2068 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
2069 if (pTbAllocator->pDelayedFreeHead)
2070 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
2071
2072 PIEMNATIVEINSTR const paFinalInstrBuf = (PIEMNATIVEINSTR)iemExecMemAllocatorAlloc(pVCpu, off * sizeof(IEMNATIVEINSTR));
2073 AssertReturn(paFinalInstrBuf, pTb);
2074 memcpy(paFinalInstrBuf, pReNative->pInstrBuf, off * sizeof(paFinalInstrBuf[0]));
2075
2076 /*
2077 * Apply fixups.
2078 */
2079 PIEMNATIVEFIXUP const paFixups = pReNative->paFixups;
2080 uint32_t const cFixups = pReNative->cFixups;
2081 for (uint32_t i = 0; i < cFixups; i++)
2082 {
2083 Assert(paFixups[i].off < off);
2084 Assert(paFixups[i].idxLabel < cLabels);
2085 RTPTRUNION const Ptr = { &paFinalInstrBuf[paFixups[i].off] };
2086 switch (paFixups[i].enmType)
2087 {
2088#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2089 case kIemNativeFixupType_Rel32:
2090 Assert(paFixups[i].off + 4 <= off);
2091 *Ptr.pi32 = paLabels[paFixups[i].idxLabel].off - paFixups[i].off + paFixups[i].offAddend;
2092 continue;
2093
2094#elif defined(RT_ARCH_ARM64)
2095 case kIemNativeFixupType_RelImm19At5:
2096 {
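 /* The 19-bit signed immediate sits in bits [23:5] (hence the 0xff00001f mask
    below) of instructions like B.cond/CBZ/CBNZ and is counted in 32-bit
    instruction units, which is also what 'off' counts on ARM64 where the
    instruction buffer entries are 32-bit. */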
2097 Assert(paFixups[i].off < off);
2098 int32_t const offDisp = paLabels[paFixups[i].idxLabel].off - paFixups[i].off + paFixups[i].offAddend;
2099 Assert(offDisp >= -262144 && offDisp < 262144);
2100 *Ptr.pu32 = (*Ptr.pu32 & UINT32_C(0xff00001f)) | (offDisp << 5);
2101 continue;
2102 }
2103#endif
2104 case kIemNativeFixupType_Invalid:
2105 case kIemNativeFixupType_End:
2106 break;
2107 }
2108 AssertFailed();
2109 }
2110
2111 iemExecMemAllocatorReadyForUse(pVCpu, paFinalInstrBuf, off * sizeof(IEMNATIVEINSTR));
2112
2113 /*
2114 * Convert the translation block.
2115 */
2116 //RT_BREAKPOINT();
2117 RTMemFree(pTb->Thrd.paCalls);
2118 pTb->Native.paInstructions = paFinalInstrBuf;
2119 pTb->Native.cInstructions = off;
2120 pTb->fFlags = (pTb->fFlags & ~IEMTB_F_TYPE_MASK) | IEMTB_F_TYPE_NATIVE;
2121
2122 Assert(pTbAllocator->cThreadedTbs > 0);
2123 pTbAllocator->cThreadedTbs -= 1;
2124 pTbAllocator->cNativeTbs += 1;
2125 Assert(pTbAllocator->cNativeTbs <= pTbAllocator->cTotalTbs);
2126
2127 return pTb;
2128}
2129