VirtualBox

source: vbox/trunk/src/VBox/VMM/VMMAll/IEMAllN8veRecompiler.cpp@101370

Last change on this file since 101370 was 101370, checked in by vboxsync, 18 months ago

Reverted commit r159372, committed too much.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 82.0 KB
 
1/* $Id: IEMAllN8veRecompiler.cpp 101370 2023-10-06 01:23:09Z vboxsync $ */
2/** @file
3 * IEM - Native Recompiler
4 *
5 * Logging group IEM_RE_NATIVE assignments:
6 * - Level 1 (Log) : ...
7 * - Flow (LogFlow) : ...
8 * - Level 2 (Log2) : ...
9 * - Level 3 (Log3) : ...
10 * - Level 4 (Log4) : ...
11 * - Level 5 (Log5) : ...
12 * - Level 6 (Log6) : ...
13 * - Level 7 (Log7) : ...
14 * - Level 8 (Log8) : ...
15 * - Level 9 (Log9) : ...
16 * - Level 10 (Log10): ...
17 * - Level 11 (Log11): ...
18 * - Level 12 (Log12): ...
19 */
20
21/*
22 * Copyright (C) 2023 Oracle and/or its affiliates.
23 *
24 * This file is part of VirtualBox base platform packages, as
25 * available from https://www.virtualbox.org.
26 *
27 * This program is free software; you can redistribute it and/or
28 * modify it under the terms of the GNU General Public License
29 * as published by the Free Software Foundation, in version 3 of the
30 * License.
31 *
32 * This program is distributed in the hope that it will be useful, but
33 * WITHOUT ANY WARRANTY; without even the implied warranty of
34 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
35 * General Public License for more details.
36 *
37 * You should have received a copy of the GNU General Public License
38 * along with this program; if not, see <https://www.gnu.org/licenses>.
39 *
40 * SPDX-License-Identifier: GPL-3.0-only
41 */
42
43
44/*********************************************************************************************************************************
45* Header Files *
46*********************************************************************************************************************************/
47#define LOG_GROUP LOG_GROUP_IEM_RE_THREADED
48#define IEM_WITH_OPAQUE_DECODER_STATE
49#define VMCPU_INCL_CPUM_GST_CTX
50#include <VBox/vmm/iem.h>
51#include <VBox/vmm/cpum.h>
52#include "IEMInternal.h"
53#include <VBox/vmm/vmcc.h>
54#include <VBox/log.h>
55#include <VBox/err.h>
56#include <VBox/param.h>
57#include <iprt/assert.h>
58#include <iprt/heap.h>
59#include <iprt/mem.h>
60#include <iprt/string.h>
61#if defined(RT_ARCH_AMD64)
62# include <iprt/x86.h>
63#elif defined(RT_ARCH_ARM64)
64# include <iprt/armv8.h>
65#endif
66
67#ifdef RT_OS_WINDOWS
68# include <iprt/formats/pecoff.h> /* this is incompatible with windows.h, thus: */
69extern "C" DECLIMPORT(uint8_t) __cdecl RtlAddFunctionTable(void *pvFunctionTable, uint32_t cEntries, uintptr_t uBaseAddress);
70extern "C" DECLIMPORT(uint8_t) __cdecl RtlDelFunctionTable(void *pvFunctionTable);
71#else
72# include <iprt/formats/dwarf.h>
73# if defined(RT_OS_DARWIN)
74# include <libkern/OSCacheControl.h>
75# define IEMNATIVE_USE_LIBUNWIND
76extern "C" void __register_frame(const void *pvFde);
77extern "C" void __deregister_frame(const void *pvFde);
78# else
79extern "C" void __register_frame_info(void *pvBegin, void *pvObj); /* found no header for these two */
80extern "C" void *__deregister_frame_info(void *pvBegin); /* (returns pvObj from __register_frame_info call) */
81# endif
82#endif
83
84#include "IEMInline.h"
85#include "IEMThreadedFunctions.h"
86#include "IEMN8veRecompiler.h"
87#include "IEMNativeFunctions.h"
88
89
90/*
91 * Narrow down configs here to avoid wasting time on unused configs.
92 * Note! Same checks in IEMAllThrdRecompiler.cpp.
93 */
94
95#ifndef IEM_WITH_CODE_TLB
96# error The code TLB must be enabled for the recompiler.
97#endif
98
99#ifndef IEM_WITH_DATA_TLB
100# error The data TLB must be enabled for the recompiler.
101#endif
102
103#ifndef IEM_WITH_SETJMP
104# error The setjmp approach must be enabled for the recompiler.
105#endif
106
107
108/*********************************************************************************************************************************
109* Executable Memory Allocator *
110*********************************************************************************************************************************/
111/** @def IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
112 * Use an alternative chunk sub-allocator that does not store internal data
113 * in the chunk.
114 *
115 * Using RTHeapSimple is not practical on newer darwin systems where
116 * RTMEM_PROT_WRITE and RTMEM_PROT_EXEC are mutually exclusive in process
117 * memory. We would have to change the protection of the whole chunk for
118 * every call to RTHeapSimple, which would be rather expensive.
119 *
120 * This alternative implementation lets us restrict page protection modifications
121 * to the pages backing the executable memory we just allocated.
122 */
123#define IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
124/** The chunk sub-allocation unit size in bytes. */
125#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE 128
126/** The chunk sub-allocation unit size as a shift factor. */
127#define IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT 7
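/* Note on the unit size (informal, hand-computed from the defines above): a
 * request is rounded up to whole 128 byte units before the bitmap scan, e.g.
 * cbReq = 200 -> (200 + 127) >> 7 = 2 units (256 bytes), cbReq = 128 -> 1 unit.
 * Each bit in the per-chunk allocation bitmap therefore covers exactly one
 * such unit of the chunk. */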
128
129#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
130/**
131 * Per-chunk unwind info for non-windows hosts.
132 */
133typedef struct IEMEXECMEMCHUNKEHFRAME
134{
135# ifdef IEMNATIVE_USE_LIBUNWIND
 136 /** The offset of the FDE into abEhFrame. */
137 uintptr_t offFda;
138# else
139 /** struct object storage area. */
140 uint8_t abObject[1024];
141# endif
142 /** The dwarf ehframe data for the chunk. */
143 uint8_t abEhFrame[512];
144} IEMEXECMEMCHUNKEHFRAME;
145/** Pointer to per-chunk unwind info for non-windows hosts. */
146typedef IEMEXECMEMCHUNKEHFRAME *PIEMEXECMEMCHUNKEHFRAME;
147#endif
148
149
150/**
 151 * A chunk of executable memory.
152 */
153typedef struct IEMEXECMEMCHUNK
154{
155#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
156 /** Number of free items in this chunk. */
157 uint32_t cFreeUnits;
 158 /** Hint where to start searching for free space in the allocation bitmap. */
159 uint32_t idxFreeHint;
160#else
161 /** The heap handle. */
162 RTHEAPSIMPLE hHeap;
163#endif
164 /** Pointer to the chunk. */
165 void *pvChunk;
166#ifdef IN_RING3
167 /**
168 * Pointer to the unwind information.
169 *
170 * This is used during C++ throw and longjmp (windows and probably most other
 171 * platforms). Some debuggers (windbg) make use of it as well.
172 *
173 * Windows: This is allocated from hHeap on windows because (at least for
174 * AMD64) the UNWIND_INFO structure address in the
175 * RUNTIME_FUNCTION entry is an RVA and the chunk is the "image".
176 *
177 * Others: Allocated from the regular heap to avoid unnecessary executable data
178 * structures. This points to an IEMEXECMEMCHUNKEHFRAME structure. */
179 void *pvUnwindInfo;
180#elif defined(IN_RING0)
181 /** Allocation handle. */
182 RTR0MEMOBJ hMemObj;
183#endif
184} IEMEXECMEMCHUNK;
185/** Pointer to a memory chunk. */
186typedef IEMEXECMEMCHUNK *PIEMEXECMEMCHUNK;
187
188
189/**
190 * Executable memory allocator for the native recompiler.
191 */
192typedef struct IEMEXECMEMALLOCATOR
193{
194 /** Magic value (IEMEXECMEMALLOCATOR_MAGIC). */
195 uint32_t uMagic;
196
197 /** The chunk size. */
198 uint32_t cbChunk;
199 /** The maximum number of chunks. */
200 uint32_t cMaxChunks;
201 /** The current number of chunks. */
202 uint32_t cChunks;
203 /** Hint where to start looking for available memory. */
204 uint32_t idxChunkHint;
205 /** Statistics: Current number of allocations. */
206 uint32_t cAllocations;
207
208 /** The total amount of memory available. */
209 uint64_t cbTotal;
210 /** Total amount of free memory. */
211 uint64_t cbFree;
212 /** Total amount of memory allocated. */
213 uint64_t cbAllocated;
214
215#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
216 /** Pointer to the allocation bitmaps for all the chunks (follows aChunks).
217 *
218 * Since the chunk size is a power of two and the minimum chunk size is a lot
219 * higher than the IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE, each chunk will always
220 * require a whole number of uint64_t elements in the allocation bitmap. So,
 221 * for the sake of simplicity, they are allocated as one continuous
 222 * block. */
223 uint64_t *pbmAlloc;
224 /** Number of units (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE) per chunk. */
225 uint32_t cUnitsPerChunk;
226 /** Number of bitmap elements per chunk (for quickly locating the bitmap
 227 * portion corresponding to a chunk). */
228 uint32_t cBitmapElementsPerChunk;
229#else
 230 /** @name Tweaks to get 64 byte aligned allocations w/o unnecessary fragmentation.
231 * @{ */
232 /** The size of the heap internal block header. This is used to adjust the
 233 * requested memory size to make sure there is exactly enough room for a header at
234 * the end of the blocks we allocate before the next 64 byte alignment line. */
235 uint32_t cbHeapBlockHdr;
 236 /** The size of the initial heap allocation required to make sure the first
237 * allocation is correctly aligned. */
238 uint32_t cbHeapAlignTweak;
239 /** The alignment tweak allocation address. */
240 void *pvAlignTweak;
241 /** @} */
242#endif
243
244#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
245 /** Pointer to the array of unwind info running parallel to aChunks (same
246 * allocation as this structure, located after the bitmaps).
247 * (For Windows, the structures must reside in 32-bit RVA distance to the
248 * actual chunk, so they are allocated off the chunk.) */
249 PIEMEXECMEMCHUNKEHFRAME paEhFrames;
250#endif
251
252 /** The allocation chunks. */
253 RT_FLEXIBLE_ARRAY_EXTENSION
254 IEMEXECMEMCHUNK aChunks[RT_FLEXIBLE_ARRAY];
255} IEMEXECMEMALLOCATOR;
256/** Pointer to an executable memory allocator. */
257typedef IEMEXECMEMALLOCATOR *PIEMEXECMEMALLOCATOR;
258
259/** Magic value for IEMEXECMEMALLOCATOR::uMagic (Scott Frederick Turow). */
260#define IEMEXECMEMALLOCATOR_MAGIC UINT32_C(0x19490412)
261
262
263static int iemExecMemAllocatorGrow(PIEMEXECMEMALLOCATOR pExecMemAllocator);
264
265
266/**
267 * Worker for iemExecMemAllocatorAlloc that returns @a pvRet after updating
268 * the heap statistics.
269 */
270static void * iemExecMemAllocatorAllocTailCode(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvRet,
271 uint32_t cbReq, uint32_t idxChunk)
272{
273 pExecMemAllocator->cAllocations += 1;
274 pExecMemAllocator->cbAllocated += cbReq;
275#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
276 pExecMemAllocator->cbFree -= cbReq;
277#else
278 pExecMemAllocator->cbFree -= RT_ALIGN_32(cbReq, 64);
279#endif
280 pExecMemAllocator->idxChunkHint = idxChunk;
281
282#ifdef RT_OS_DARWIN
283 /*
284 * Sucks, but RTMEM_PROT_EXEC and RTMEM_PROT_WRITE are mutually exclusive
285 * on darwin. So, we mark the pages returned as read+write after alloc and
286 * expect the caller to call iemExecMemAllocatorReadyForUse when done
287 * writing to the allocation.
288 *
289 * See also https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
290 * for details.
291 */
292 /** @todo detect if this is necessary... it wasn't required on 10.15 or
293 * whatever older version it was. */
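    /* The intended usage pattern on darwin thus looks something like this
     * (a sketch; pb, cb and abNativeCode are just placeholder names):
     *      uint8_t *pb = (uint8_t *)iemExecMemAllocatorAlloc(pVCpu, cb);  // returned read+write here
     *      memcpy(pb, abNativeCode, cb);                                  // emit/copy the code
     *      iemExecMemAllocatorReadyForUse(pVCpu, pb, cb);                 // flip to read+exec & flush icache
     */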
294 int rc = RTMemProtect(pvRet, cbReq, RTMEM_PROT_WRITE | RTMEM_PROT_READ);
295 AssertRC(rc);
296#endif
297
298 return pvRet;
299}
300
301
302#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
303static void *iemExecMemAllocatorAllocInChunkInt(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint64_t *pbmAlloc, uint32_t idxFirst,
304 uint32_t cToScan, uint32_t cReqUnits, uint32_t idxChunk)
305{
306 /*
307 * Shift the bitmap to the idxFirst bit so we can use ASMBitFirstClear.
308 */
309 Assert(!(cToScan & 63));
310 Assert(!(idxFirst & 63));
311 Assert(cToScan + idxFirst <= pExecMemAllocator->cUnitsPerChunk);
312 pbmAlloc += idxFirst / 64;
313
314 /*
 315 * Scan the bitmap for cReqUnits consecutive clear bits
316 */
317 /** @todo This can probably be done more efficiently for non-x86 systems. */
318 int iBit = ASMBitFirstClear(pbmAlloc, cToScan);
319 while (iBit >= 0 && (uint32_t)iBit <= cToScan - cReqUnits)
320 {
321 uint32_t idxAddBit = 1;
322 while (idxAddBit < cReqUnits && !ASMBitTest(pbmAlloc, (uint32_t)iBit + idxAddBit))
323 idxAddBit++;
324 if (idxAddBit >= cReqUnits)
325 {
326 ASMBitSetRange(pbmAlloc, (uint32_t)iBit, (uint32_t)iBit + cReqUnits);
327
328 PIEMEXECMEMCHUNK const pChunk = &pExecMemAllocator->aChunks[idxChunk];
329 pChunk->cFreeUnits -= cReqUnits;
330 pChunk->idxFreeHint = (uint32_t)iBit + cReqUnits;
331
332 void * const pvRet = (uint8_t *)pChunk->pvChunk
333 + ((idxFirst + (uint32_t)iBit) << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT);
334
335 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet,
336 cReqUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT, idxChunk);
337 }
338
339 iBit = ASMBitNextClear(pbmAlloc, cToScan, iBit + idxAddBit - 1);
340 }
341 return NULL;
342}
343#endif /* IEMEXECMEM_USE_ALT_SUB_ALLOCATOR */
344
345
346static void *iemExecMemAllocatorAllocInChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, uint32_t idxChunk, uint32_t cbReq)
347{
348#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
349 /*
350 * Figure out how much to allocate.
351 */
352 uint32_t const cReqUnits = (cbReq + IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1) >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
353 if (cReqUnits <= pExecMemAllocator->aChunks[idxChunk].cFreeUnits)
354 {
355 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
356 uint32_t const idxHint = pExecMemAllocator->aChunks[idxChunk].idxFreeHint & ~(uint32_t)63;
357 if (idxHint + cReqUnits <= pExecMemAllocator->cUnitsPerChunk)
358 {
359 void *pvRet = iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, idxHint,
360 pExecMemAllocator->cUnitsPerChunk - idxHint, cReqUnits, idxChunk);
361 if (pvRet)
362 return pvRet;
363 }
364 return iemExecMemAllocatorAllocInChunkInt(pExecMemAllocator, pbmAlloc, 0,
365 RT_MIN(pExecMemAllocator->cUnitsPerChunk, RT_ALIGN_32(idxHint + cReqUnits, 64)),
366 cReqUnits, idxChunk);
367 }
368#else
369 void *pvRet = RTHeapSimpleAlloc(pExecMemAllocator->aChunks[idxChunk].hHeap, cbReq, 32);
370 if (pvRet)
371 return iemExecMemAllocatorAllocTailCode(pExecMemAllocator, pvRet, cbReq, idxChunk);
372#endif
373 return NULL;
374
375}
376
377
378/**
379 * Allocates @a cbReq bytes of executable memory.
380 *
381 * @returns Pointer to the memory, NULL if out of memory or other problem
382 * encountered.
383 * @param pVCpu The cross context virtual CPU structure of the calling
384 * thread.
385 * @param cbReq How many bytes are required.
386 */
387static void *iemExecMemAllocatorAlloc(PVMCPU pVCpu, uint32_t cbReq)
388{
389 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
390 AssertReturn(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC, NULL);
391 AssertMsgReturn(cbReq > 32 && cbReq < _512K, ("%#x\n", cbReq), NULL);
392
393 /*
394 * Adjust the request size so it'll fit the allocator alignment/whatnot.
395 *
396 * For the RTHeapSimple allocator this means to follow the logic described
397 * in iemExecMemAllocatorGrow and attempt to allocate it from one of the
398 * existing chunks if we think we've got sufficient free memory around.
399 *
400 * While for the alternative one we just align it up to a whole unit size.
401 */
402#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
403 cbReq = RT_ALIGN_32(cbReq, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
404#else
405 cbReq = RT_ALIGN_32(cbReq + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
406#endif
407 if (cbReq <= pExecMemAllocator->cbFree)
408 {
409 uint32_t const cChunks = pExecMemAllocator->cChunks;
410 uint32_t const idxChunkHint = pExecMemAllocator->idxChunkHint < cChunks ? pExecMemAllocator->idxChunkHint : 0;
411 for (uint32_t idxChunk = idxChunkHint; idxChunk < cChunks; idxChunk++)
412 {
413 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
414 if (pvRet)
415 return pvRet;
416 }
417 for (uint32_t idxChunk = 0; idxChunk < idxChunkHint; idxChunk++)
418 {
419 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
420 if (pvRet)
421 return pvRet;
422 }
423 }
424
425 /*
426 * Can we grow it with another chunk?
427 */
428 if (pExecMemAllocator->cChunks < pExecMemAllocator->cMaxChunks)
429 {
430 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
431 AssertLogRelRCReturn(rc, NULL);
432
433 uint32_t const idxChunk = pExecMemAllocator->cChunks - 1;
434 void *pvRet = iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbReq);
435 if (pvRet)
436 return pvRet;
437 AssertFailed();
438 }
439
440 /* What now? Prune native translation blocks from the cache? */
441 AssertFailed();
442 return NULL;
443}
444
445
446/** This is a hook that we may need later for changing memory protection back
447 * to readonly+exec */
448static void iemExecMemAllocatorReadyForUse(PVMCPUCC pVCpu, void *pv, size_t cb)
449{
450#ifdef RT_OS_DARWIN
451 /* See iemExecMemAllocatorAllocTailCode for the explanation. */
452 int rc = RTMemProtect(pv, cb, RTMEM_PROT_EXEC | RTMEM_PROT_READ);
453 AssertRC(rc); RT_NOREF(pVCpu);
454
455 /*
456 * Flush the instruction cache:
457 * https://developer.apple.com/documentation/apple-silicon/porting-just-in-time-compilers-to-apple-silicon
458 */
459 /* sys_dcache_flush(pv, cb); - not necessary */
460 sys_icache_invalidate(pv, cb);
461#else
462 RT_NOREF(pVCpu, pv, cb);
463#endif
464}
465
466
467/**
468 * Frees executable memory.
469 */
470void iemExecMemAllocatorFree(PVMCPU pVCpu, void *pv, size_t cb)
471{
472 PIEMEXECMEMALLOCATOR pExecMemAllocator = pVCpu->iem.s.pExecMemAllocatorR3;
473 Assert(pExecMemAllocator && pExecMemAllocator->uMagic == IEMEXECMEMALLOCATOR_MAGIC);
474 Assert(pv);
475#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
476 Assert(!((uintptr_t)pv & (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE - 1)));
477#else
478 Assert(!((uintptr_t)pv & 63));
479#endif
480
481 /* Align the size as we did when allocating the block. */
482#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
483 cb = RT_ALIGN_Z(cb, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
484#else
485 cb = RT_ALIGN_Z(cb + pExecMemAllocator->cbHeapBlockHdr, 64) - pExecMemAllocator->cbHeapBlockHdr;
486#endif
487
488 /* Free it / assert sanity. */
489#if defined(VBOX_STRICT) || defined(IEMEXECMEM_USE_ALT_SUB_ALLOCATOR)
490 uint32_t const cChunks = pExecMemAllocator->cChunks;
491 uint32_t const cbChunk = pExecMemAllocator->cbChunk;
492 bool fFound = false;
493 for (uint32_t idxChunk = 0; idxChunk < cChunks; idxChunk++)
494 {
495 uintptr_t const offChunk = (uintptr_t)pv - (uintptr_t)pExecMemAllocator->aChunks[idxChunk].pvChunk;
496 fFound = offChunk < cbChunk;
497 if (fFound)
498 {
499#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
500 uint32_t const idxFirst = offChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
501 uint32_t const cReqUnits = cb >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
502
503 /* Check that it's valid and free it. */
504 uint64_t * const pbmAlloc = &pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk];
505 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst));
506 for (uint32_t i = 1; i < cReqUnits; i++)
507 AssertReturnVoid(ASMBitTest(pbmAlloc, idxFirst + i));
508 ASMBitClearRange(pbmAlloc, idxFirst, idxFirst + cReqUnits);
509
510 pExecMemAllocator->aChunks[idxChunk].cFreeUnits += cReqUnits;
511 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = idxFirst;
512
513 /* Update the stats. */
514 pExecMemAllocator->cbAllocated -= cb;
515 pExecMemAllocator->cbFree += cb;
516 pExecMemAllocator->cAllocations -= 1;
517 return;
518#else
519 Assert(RTHeapSimpleSize(pExecMemAllocator->aChunks[idxChunk].hHeap, pv) == cb);
520 break;
521#endif
522 }
523 }
524# ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
525 AssertFailed();
526# else
527 Assert(fFound);
528# endif
529#endif
530
531#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
 532 /* Update stats while cb is freshly calculated. */
533 pExecMemAllocator->cbAllocated -= cb;
534 pExecMemAllocator->cbFree += RT_ALIGN_Z(cb, 64);
535 pExecMemAllocator->cAllocations -= 1;
536
537 /* Free it. */
538 RTHeapSimpleFree(NIL_RTHEAPSIMPLE, pv);
539#endif
540}
541
542
543
544#ifdef IN_RING3
545# ifdef RT_OS_WINDOWS
546
547/**
548 * Initializes the unwind info structures for windows hosts.
549 */
550static int
551iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvChunk, uint32_t idxChunk)
552{
553 /*
554 * The AMD64 unwind opcodes.
555 *
556 * This is a program that starts with RSP after a RET instruction that
557 * ends up in recompiled code, and the operations we describe here will
558 * restore all non-volatile registers and bring RSP back to where our
559 * RET address is. This means it's reverse order from what happens in
560 * the prologue.
561 *
562 * Note! Using a frame register approach here both because we have one
 563 * and mainly because the UWOP_ALLOC_LARGE argument values
564 * would be a pain to write initializers for. On the positive
 565 * side, we're impervious to changes in the stack variable
 566 * area and can deal with dynamic stack allocations if necessary.
567 */
568 static const IMAGE_UNWIND_CODE s_aOpcodes[] =
569 {
570 { { 16, IMAGE_AMD64_UWOP_SET_FPREG, 0 } }, /* RSP = RBP - FrameOffset * 10 (0x60) */
571 { { 16, IMAGE_AMD64_UWOP_ALLOC_SMALL, 0 } }, /* RSP += 8; */
572 { { 14, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x15 } }, /* R15 = [RSP]; RSP += 8; */
573 { { 12, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x14 } }, /* R14 = [RSP]; RSP += 8; */
574 { { 10, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x13 } }, /* R13 = [RSP]; RSP += 8; */
575 { { 8, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_x12 } }, /* R12 = [RSP]; RSP += 8; */
576 { { 7, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xDI } }, /* RDI = [RSP]; RSP += 8; */
577 { { 6, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xSI } }, /* RSI = [RSP]; RSP += 8; */
578 { { 5, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBX } }, /* RBX = [RSP]; RSP += 8; */
579 { { 4, IMAGE_AMD64_UWOP_PUSH_NONVOL, X86_GREG_xBP } }, /* RBP = [RSP]; RSP += 8; */
580 };
581 union
582 {
583 IMAGE_UNWIND_INFO Info;
584 uint8_t abPadding[RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes) + 16];
585 } s_UnwindInfo =
586 {
587 {
588 /* .Version = */ 1,
589 /* .Flags = */ 0,
590 /* .SizeOfProlog = */ 16, /* whatever */
591 /* .CountOfCodes = */ RT_ELEMENTS(s_aOpcodes),
592 /* .FrameRegister = */ X86_GREG_xBP,
593 /* .FrameOffset = */ (-IEMNATIVE_FP_OFF_LAST_PUSH + 8) / 16 /* we're off by one slot. sigh. */,
594 }
595 };
596 AssertCompile(-IEMNATIVE_FP_OFF_LAST_PUSH < 240 && -IEMNATIVE_FP_OFF_LAST_PUSH > 0);
597 AssertCompile((-IEMNATIVE_FP_OFF_LAST_PUSH & 0xf) == 8);
598
599 /*
600 * Calc how much space we need and allocate it off the exec heap.
601 */
602 unsigned const cFunctionEntries = 1;
603 unsigned const cbUnwindInfo = sizeof(s_aOpcodes) + RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes);
604 unsigned const cbNeeded = sizeof(IMAGE_RUNTIME_FUNCTION_ENTRY) * cFunctionEntries + cbUnwindInfo;
605# ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
606 unsigned const cbNeededAligned = RT_ALIGN_32(cbNeeded, IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SIZE);
607 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions
608 = (PIMAGE_RUNTIME_FUNCTION_ENTRY)iemExecMemAllocatorAllocInChunk(pExecMemAllocator, idxChunk, cbNeededAligned);
609# else
610 unsigned const cbNeededAligned = RT_ALIGN_32(cbNeeded + pExecMemAllocator->cbHeapBlockHdr, 64)
611 - pExecMemAllocator->cbHeapBlockHdr;
612 PIMAGE_RUNTIME_FUNCTION_ENTRY const paFunctions = (PIMAGE_RUNTIME_FUNCTION_ENTRY)RTHeapSimpleAlloc(hHeap, cbNeededAligned,
613 32 /*cbAlignment*/);
614# endif
615 AssertReturn(paFunctions, VERR_INTERNAL_ERROR_5);
616 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = paFunctions;
617
618 /*
619 * Initialize the structures.
620 */
621 PIMAGE_UNWIND_INFO const pInfo = (PIMAGE_UNWIND_INFO)&paFunctions[cFunctionEntries];
622
623 paFunctions[0].BeginAddress = 0;
624 paFunctions[0].EndAddress = pExecMemAllocator->cbChunk;
625 paFunctions[0].UnwindInfoAddress = (uint32_t)((uintptr_t)pInfo - (uintptr_t)pvChunk);
626
627 memcpy(pInfo, &s_UnwindInfo, RT_UOFFSETOF(IMAGE_UNWIND_INFO, aOpcodes));
628 memcpy(&pInfo->aOpcodes[0], s_aOpcodes, sizeof(s_aOpcodes));
629
630 /*
631 * Register it.
632 */
633 uint8_t fRet = RtlAddFunctionTable(paFunctions, cFunctionEntries, (uintptr_t)pvChunk);
 634 AssertReturn(fRet, VERR_INTERNAL_ERROR_3); /* Nothing to clean up on failure, since it's within the chunk itself. */
 635
 636 return VINF_SUCCESS;
637}
638
639
640# else /* !RT_OS_WINDOWS */
641
642/**
643 * Emits a LEB128 encoded value between -0x2000 and 0x2000 (both exclusive).
644 */
645DECLINLINE(RTPTRUNION) iemDwarfPutLeb128(RTPTRUNION Ptr, int32_t iValue)
646{
647 if (iValue >= 64)
648 {
649 Assert(iValue < 0x2000);
650 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
651 *Ptr.pb++ = (uint8_t)(iValue >> 7) & 0x3f;
652 }
653 else if (iValue >= 0)
654 *Ptr.pb++ = (uint8_t)iValue;
655 else if (iValue > -64)
656 *Ptr.pb++ = ((uint8_t)iValue & 0x3f) | 0x40;
657 else
658 {
659 Assert(iValue > -0x2000);
660 *Ptr.pb++ = ((uint8_t)iValue & 0x7f) | 0x80;
661 *Ptr.pb++ = ((uint8_t)(iValue >> 7) & 0x3f) | 0x40;
662 }
663 return Ptr;
664}
665
666
667/**
668 * Emits an ULEB128 encoded value (up to 64-bit wide).
669 */
670DECLINLINE(RTPTRUNION) iemDwarfPutUleb128(RTPTRUNION Ptr, uint64_t uValue)
671{
672 while (uValue >= 0x80)
673 {
674 *Ptr.pb++ = ((uint8_t)uValue & 0x7f) | 0x80;
675 uValue >>= 7;
676 }
677 *Ptr.pb++ = (uint8_t)uValue;
678 return Ptr;
679}
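/* Hand-computed examples of the encodings produced by the two helpers above
 * (informal, for reading convenience only):
 *   iemDwarfPutUleb128(Ptr, 300)  emits 0xAC 0x02  (300 = 0x12C: low 7 bits + continuation bit, then the rest)
 *   iemDwarfPutLeb128(Ptr, -8)    emits 0x78       (single byte, sign bit 0x40 set; this is the data
 *                                                   alignment factor used in the CIE below)
 *   iemDwarfPutLeb128(Ptr, 129)   emits 0x81 0x01  (two bytes, since bit 6 of a single byte would be
 *                                                   taken as the sign bit) */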
680
681
682/**
683 * Emits a CFA rule as register @a uReg + offset @a off.
684 */
685DECLINLINE(RTPTRUNION) iemDwarfPutCfaDefCfa(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
686{
687 *Ptr.pb++ = DW_CFA_def_cfa;
688 Ptr = iemDwarfPutUleb128(Ptr, uReg);
689 Ptr = iemDwarfPutUleb128(Ptr, off);
690 return Ptr;
691}
692
693
694/**
695 * Emits a register (@a uReg) save location:
696 * CFA + @a off * data_alignment_factor
697 */
698DECLINLINE(RTPTRUNION) iemDwarfPutCfaOffset(RTPTRUNION Ptr, uint32_t uReg, uint32_t off)
699{
700 if (uReg < 0x40)
701 *Ptr.pb++ = DW_CFA_offset | uReg;
702 else
703 {
704 *Ptr.pb++ = DW_CFA_offset_extended;
705 Ptr = iemDwarfPutUleb128(Ptr, uReg);
706 }
707 Ptr = iemDwarfPutUleb128(Ptr, off);
708 return Ptr;
709}
710
711
712# if 0 /* unused */
713/**
714 * Emits a register (@a uReg) save location, using signed offset:
715 * CFA + @a offSigned * data_alignment_factor
716 */
717DECLINLINE(RTPTRUNION) iemDwarfPutCfaSignedOffset(RTPTRUNION Ptr, uint32_t uReg, int32_t offSigned)
718{
719 *Ptr.pb++ = DW_CFA_offset_extended_sf;
720 Ptr = iemDwarfPutUleb128(Ptr, uReg);
721 Ptr = iemDwarfPutLeb128(Ptr, offSigned);
722 return Ptr;
723}
724# endif
725
726
727/**
728 * Initializes the unwind info section for non-windows hosts.
729 */
730static int
731iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(PIEMEXECMEMALLOCATOR pExecMemAllocator, void *pvChunk, uint32_t idxChunk)
732{
733 PIEMEXECMEMCHUNKEHFRAME const pEhFrame = &pExecMemAllocator->paEhFrames[idxChunk];
734 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = pEhFrame; /* not necessary, but whatever */
735
736 RTPTRUNION Ptr = { pEhFrame->abEhFrame };
737
738 /*
739 * Generate the CIE first.
740 */
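    /* Rough shape of the .eh_frame data emitted below (derived from the code,
     * 64-bit hosts, all lengths patched in afterwards):
     *   CIE: u32 length | u32 0 (CIE marker) | u8 version | u8 augmentation ("") |
     *        [u8 addr size, u8 seg size - v4 only] | code align | data align |
     *        return address column | initial CFA instructions | DW_CFA_nop padding
     *   FDE: u32 length | u32 back-offset to the CIE | u64 chunk start address |
     *        u64 chunk size | DW_CFA_nop padding
     *   followed by a zero terminator entry. */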
741# ifdef IEMNATIVE_USE_LIBUNWIND /* libunwind (llvm, darwin) only supports v1 and v3. */
742 uint8_t const iDwarfVer = 3;
743# else
744 uint8_t const iDwarfVer = 4;
745# endif
746 RTPTRUNION const PtrCie = Ptr;
747 *Ptr.pu32++ = 123; /* The CIE length will be determined later. */
748 *Ptr.pu32++ = 0 /*UINT32_MAX*/; /* I'm a CIE in .eh_frame speak. */
 749 *Ptr.pb++ = iDwarfVer; /* DWARF version */
750 *Ptr.pb++ = 0; /* Augmentation. */
751 if (iDwarfVer >= 4)
752 {
753 *Ptr.pb++ = sizeof(uintptr_t); /* Address size. */
754 *Ptr.pb++ = 0; /* Segment selector size. */
755 }
756# ifdef RT_ARCH_AMD64
757 Ptr = iemDwarfPutLeb128(Ptr, 1); /* Code alignment factor (LEB128 = 1). */
758# else
759 Ptr = iemDwarfPutLeb128(Ptr, 4); /* Code alignment factor (LEB128 = 4). */
760# endif
761 Ptr = iemDwarfPutLeb128(Ptr, -8); /* Data alignment factor (LEB128 = -8). */
762# ifdef RT_ARCH_AMD64
763 Ptr = iemDwarfPutUleb128(Ptr, DWREG_AMD64_RA); /* Return address column (ULEB128) */
764# elif defined(RT_ARCH_ARM64)
765 Ptr = iemDwarfPutUleb128(Ptr, DWREG_ARM64_LR); /* Return address column (ULEB128) */
766# else
767# error "port me"
768# endif
769 /* Initial instructions: */
770# ifdef RT_ARCH_AMD64
771 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_AMD64_RBP, 16); /* CFA = RBP + 0x10 - first stack parameter */
772 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RA, 1); /* Ret RIP = [CFA + 1*-8] */
773 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBP, 2); /* RBP = [CFA + 2*-8] */
774 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_RBX, 3); /* RBX = [CFA + 3*-8] */
775 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R12, 4); /* R12 = [CFA + 4*-8] */
776 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R13, 5); /* R13 = [CFA + 5*-8] */
777 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R14, 6); /* R14 = [CFA + 6*-8] */
778 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_AMD64_R15, 7); /* R15 = [CFA + 7*-8] */
779# elif defined(RT_ARCH_ARM64)
780# if 1
781 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_BP, 16); /* CFA = BP + 0x10 - first stack parameter */
782# else
783 Ptr = iemDwarfPutCfaDefCfa(Ptr, DWREG_ARM64_SP, IEMNATIVE_FRAME_VAR_SIZE + IEMNATIVE_FRAME_SAVE_REG_SIZE);
784# endif
785 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_LR, 1); /* Ret PC = [CFA + 1*-8] */
786 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_BP, 2); /* Ret BP = [CFA + 2*-8] */
787 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X28, 3); /* X28 = [CFA + 3*-8] */
788 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X27, 4); /* X27 = [CFA + 4*-8] */
789 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X26, 5); /* X26 = [CFA + 5*-8] */
790 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X25, 6); /* X25 = [CFA + 6*-8] */
791 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X24, 7); /* X24 = [CFA + 7*-8] */
792 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X23, 8); /* X23 = [CFA + 8*-8] */
793 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X22, 9); /* X22 = [CFA + 9*-8] */
794 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X21, 10); /* X21 = [CFA +10*-8] */
795 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X20, 11); /* X20 = [CFA +11*-8] */
796 Ptr = iemDwarfPutCfaOffset(Ptr, DWREG_ARM64_X19, 12); /* X19 = [CFA +12*-8] */
797 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
 798 /** @todo do we need to do something about clearing DWREG_ARM64_RA_SIGN_STATE or something? */
799# else
800# error "port me"
801# endif
802 while ((Ptr.u - PtrCie.u) & 3)
803 *Ptr.pb++ = DW_CFA_nop;
804 /* Finalize the CIE size. */
805 *PtrCie.pu32 = Ptr.u - PtrCie.u - sizeof(uint32_t);
806
807 /*
808 * Generate an FDE for the whole chunk area.
809 */
810# ifdef IEMNATIVE_USE_LIBUNWIND
811 pEhFrame->offFda = Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0];
812# endif
813 RTPTRUNION const PtrFde = Ptr;
 814 *Ptr.pu32++ = 123; /* The FDE length will be determined later. */
815 *Ptr.pu32 = Ptr.u - PtrCie.u; /* Negated self relative CIE address. */
816 Ptr.pu32++;
817 *Ptr.pu64++ = (uintptr_t)pvChunk; /* Absolute start PC of this FDE. */
 818 *Ptr.pu64++ = pExecMemAllocator->cbChunk; /* PC range length for this FDE. */
 819# if 0 /* not required for recent libunwind.dylib nor recent libgcc/glibc. */
820 *Ptr.pb++ = DW_CFA_nop;
821# endif
822 while ((Ptr.u - PtrFde.u) & 3)
823 *Ptr.pb++ = DW_CFA_nop;
824 /* Finalize the FDE size. */
825 *PtrFde.pu32 = Ptr.u - PtrFde.u - sizeof(uint32_t);
826
827 /* Terminator entry. */
828 *Ptr.pu32++ = 0;
829 *Ptr.pu32++ = 0; /* just to be sure... */
830 Assert(Ptr.u - (uintptr_t)&pEhFrame->abEhFrame[0] <= sizeof(pEhFrame->abEhFrame));
831
832 /*
833 * Register it.
834 */
835# ifdef IEMNATIVE_USE_LIBUNWIND
836 __register_frame(&pEhFrame->abEhFrame[pEhFrame->offFda]);
837# else
838 memset(pEhFrame->abObject, 0xf6, sizeof(pEhFrame->abObject)); /* color the memory to better spot usage */
839 __register_frame_info(pEhFrame->abEhFrame, pEhFrame->abObject);
840# endif
841
842 return VINF_SUCCESS;
843}
844
845# endif /* !RT_OS_WINDOWS */
846#endif /* IN_RING3 */
847
848
849/**
850 * Adds another chunk to the executable memory allocator.
851 *
852 * This is used by the init code for the initial allocation and later by the
853 * regular allocator function when it's out of memory.
854 */
855static int iemExecMemAllocatorGrow(PIEMEXECMEMALLOCATOR pExecMemAllocator)
856{
857 /* Check that we've room for growth. */
858 uint32_t const idxChunk = pExecMemAllocator->cChunks;
859 AssertLogRelReturn(idxChunk < pExecMemAllocator->cMaxChunks, VERR_OUT_OF_RESOURCES);
860
861 /* Allocate a chunk. */
862#ifdef RT_OS_DARWIN
863 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, 0);
864#else
865 void *pvChunk = RTMemPageAllocEx(pExecMemAllocator->cbChunk, RTMEMPAGEALLOC_F_EXECUTABLE);
866#endif
867 AssertLogRelReturn(pvChunk, VERR_NO_EXEC_MEMORY);
868
869#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
870 int rc = VINF_SUCCESS;
871#else
872 /* Initialize the heap for the chunk. */
873 RTHEAPSIMPLE hHeap = NIL_RTHEAPSIMPLE;
874 int rc = RTHeapSimpleInit(&hHeap, pvChunk, pExecMemAllocator->cbChunk);
875 AssertRC(rc);
876 if (RT_SUCCESS(rc))
877 {
878 /*
 879 * We want the memory to be aligned on a 64 byte boundary, so the first time thru
880 * here we do some exploratory allocations to see how we can achieve this.
881 * On subsequent runs we only make an initial adjustment allocation, if
882 * necessary.
883 *
884 * Since we own the heap implementation, we know that the internal block
885 * header is 32 bytes in size for 64-bit systems (see RTHEAPSIMPLEBLOCK),
 886 * so all we need to do wrt allocation size adjustments is to add 32 bytes
887 * to the size, align up by 64 bytes, and subtract 32 bytes.
888 *
889 * The heap anchor block is 8 * sizeof(void *) (see RTHEAPSIMPLEINTERNAL),
 890 * which means 64 bytes on a 64-bit system, so we need to make a 64 byte
891 * allocation to force subsequent allocations to return 64 byte aligned
892 * user areas.
893 */
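        /* Worked example (assuming the 32 byte block header described above): a
         * 100 byte request becomes RT_ALIGN_32(100 + 32, 64) - 32 = 160 bytes, so a
         * user area at a 64 byte aligned address X spans [X, X+160), the next 32 byte
         * block header ends exactly at X+192, and the following user area is 64 byte
         * aligned again. */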
894 if (!pExecMemAllocator->cbHeapBlockHdr)
895 {
896 pExecMemAllocator->cbHeapBlockHdr = sizeof(void *) * 4; /* See RTHEAPSIMPLEBLOCK. */
897 pExecMemAllocator->cbHeapAlignTweak = 64;
898 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak,
899 32 /*cbAlignment*/);
900 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_2);
901
902 void *pvTest1 = RTHeapSimpleAlloc(hHeap,
903 RT_ALIGN_32(256 + pExecMemAllocator->cbHeapBlockHdr, 64)
904 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
905 AssertStmt(pvTest1, rc = VERR_INTERNAL_ERROR_2);
906 AssertStmt(!((uintptr_t)pvTest1 & 63), rc = VERR_INTERNAL_ERROR_3);
907
908 void *pvTest2 = RTHeapSimpleAlloc(hHeap,
909 RT_ALIGN_32(687 + pExecMemAllocator->cbHeapBlockHdr, 64)
910 - pExecMemAllocator->cbHeapBlockHdr, 32 /*cbAlignment*/);
911 AssertStmt(pvTest2, rc = VERR_INTERNAL_ERROR_2);
912 AssertStmt(!((uintptr_t)pvTest2 & 63), rc = VERR_INTERNAL_ERROR_3);
913
914 RTHeapSimpleFree(hHeap, pvTest2);
915 RTHeapSimpleFree(hHeap, pvTest1);
916 }
917 else
918 {
919 pExecMemAllocator->pvAlignTweak = RTHeapSimpleAlloc(hHeap, pExecMemAllocator->cbHeapAlignTweak, 32 /*cbAlignment*/);
920 AssertStmt(pExecMemAllocator->pvAlignTweak, rc = VERR_INTERNAL_ERROR_4);
921 }
922 if (RT_SUCCESS(rc))
923#endif /* !IEMEXECMEM_USE_ALT_SUB_ALLOCATOR */
924 {
925 /*
926 * Add the chunk.
927 *
928 * This must be done before the unwind init so windows can allocate
929 * memory from the chunk when using the alternative sub-allocator.
930 */
931 pExecMemAllocator->aChunks[idxChunk].pvChunk = pvChunk;
932#ifdef IN_RING3
933 pExecMemAllocator->aChunks[idxChunk].pvUnwindInfo = NULL;
934#endif
935#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
936 pExecMemAllocator->aChunks[idxChunk].hHeap = hHeap;
937#else
938 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = pExecMemAllocator->cUnitsPerChunk;
939 pExecMemAllocator->aChunks[idxChunk].idxFreeHint = 0;
940 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
941 0, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
942#endif
943
944 pExecMemAllocator->cChunks = idxChunk + 1;
945 pExecMemAllocator->idxChunkHint = idxChunk;
946
947#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
948 pExecMemAllocator->cbTotal += pExecMemAllocator->cbChunk;
949 pExecMemAllocator->cbFree += pExecMemAllocator->cbChunk;
950#else
951 size_t const cbFree = RTHeapSimpleGetFreeSize(hHeap);
952 pExecMemAllocator->cbTotal += cbFree;
953 pExecMemAllocator->cbFree += cbFree;
954#endif
955
956#ifdef IN_RING3
957 /*
958 * Initialize the unwind information (this cannot really fail atm).
959 * (This sets pvUnwindInfo.)
960 */
961 rc = iemExecMemAllocatorInitAndRegisterUnwindInfoForChunk(pExecMemAllocator, pvChunk, idxChunk);
962 if (RT_SUCCESS(rc))
963#endif
964 {
965 return VINF_SUCCESS;
966 }
967
968#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
 969 /* Just in case the impossible happens, undo the above: */
970 pExecMemAllocator->cbTotal -= pExecMemAllocator->cbChunk;
971 pExecMemAllocator->cbFree -= pExecMemAllocator->aChunks[idxChunk].cFreeUnits << IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
972 pExecMemAllocator->cChunks = idxChunk;
973 memset(&pExecMemAllocator->pbmAlloc[pExecMemAllocator->cBitmapElementsPerChunk * idxChunk],
974 0xff, sizeof(pExecMemAllocator->pbmAlloc[0]) * pExecMemAllocator->cBitmapElementsPerChunk);
975 pExecMemAllocator->aChunks[idxChunk].pvChunk = NULL;
976 pExecMemAllocator->aChunks[idxChunk].cFreeUnits = 0;
977#endif
978 }
979#ifndef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
980 }
981#endif
982 RTMemPageFree(pvChunk, pExecMemAllocator->cbChunk);
983 return rc;
984}
985
986
987/**
988 * Initializes the executable memory allocator for native recompilation on the
989 * calling EMT.
990 *
991 * @returns VBox status code.
992 * @param pVCpu The cross context virtual CPU structure of the calling
993 * thread.
994 * @param cbMax The max size of the allocator.
995 * @param cbInitial The initial allocator size.
996 * @param cbChunk The chunk size, 0 or UINT32_MAX for default (@a cbMax
997 * dependent).
998 */
999int iemExecMemAllocatorInit(PVMCPU pVCpu, uint64_t cbMax, uint64_t cbInitial, uint32_t cbChunk)
1000{
1001 /*
1002 * Validate input.
1003 */
1004 AssertLogRelMsgReturn(cbMax >= _1M && cbMax <= _4G+_4G, ("cbMax=%RU64 (%RX64)\n", cbMax, cbMax), VERR_OUT_OF_RANGE);
1005 AssertReturn(cbInitial <= cbMax, VERR_OUT_OF_RANGE);
1006 AssertLogRelMsgReturn( cbChunk != UINT32_MAX
1007 || cbChunk == 0
1008 || ( RT_IS_POWER_OF_TWO(cbChunk)
1009 && cbChunk >= _1M
1010 && cbChunk <= _256M
1011 && cbChunk <= cbMax),
1012 ("cbChunk=%RU32 (%RX32) cbMax=%RU64\n", cbChunk, cbChunk, cbMax),
1013 VERR_OUT_OF_RANGE);
1014
1015 /*
1016 * Adjust/figure out the chunk size.
1017 */
1018 if (cbChunk == 0 || cbChunk == UINT32_MAX)
1019 {
1020 if (cbMax >= _256M)
1021 cbChunk = _64M;
1022 else
1023 {
1024 if (cbMax < _16M)
1025 cbChunk = cbMax >= _4M ? _4M : (uint32_t)cbMax;
1026 else
1027 cbChunk = (uint32_t)cbMax / 4;
1028 if (!RT_IS_POWER_OF_TWO(cbChunk))
1029 cbChunk = RT_BIT_32(ASMBitLastSetU32(cbChunk));
1030 }
1031 }
1032
1033 if (cbChunk > cbMax)
1034 cbMax = cbChunk;
1035 else
1036 cbMax = (cbMax - 1 + cbChunk) / cbChunk * cbChunk;
1037 uint32_t const cMaxChunks = (uint32_t)(cbMax / cbChunk);
1038 AssertLogRelReturn((uint64_t)cMaxChunks * cbChunk == cbMax, VERR_INTERNAL_ERROR_3);
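    /* A couple of worked examples of the defaulting above: cbMax = 512M gives
       cbChunk = 64M and cMaxChunks = 8; cbMax = 64M gives cbChunk = 64M / 4 = 16M
       (already a power of two) and cMaxChunks = 4. */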
1039
1040 /*
1041 * Allocate and initialize the allocator instance.
1042 */
1043 size_t cbNeeded = RT_UOFFSETOF_DYN(IEMEXECMEMALLOCATOR, aChunks[cMaxChunks]);
1044#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1045 size_t const offBitmaps = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
1046 size_t const cbBitmap = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3);
1047 cbNeeded += cbBitmap * cMaxChunks;
1048 AssertCompile(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT <= 10);
1049 Assert(cbChunk > RT_BIT_32(IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 3));
1050#endif
1051#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
1052 size_t const offEhFrames = RT_ALIGN_Z(cbNeeded, RT_CACHELINE_SIZE);
1053 cbNeeded += sizeof(IEMEXECMEMCHUNKEHFRAME) * cMaxChunks;
1054#endif
1055 PIEMEXECMEMALLOCATOR pExecMemAllocator = (PIEMEXECMEMALLOCATOR)RTMemAllocZ(cbNeeded);
1056 AssertLogRelMsgReturn(pExecMemAllocator, ("cbNeeded=%zx cMaxChunks=%#x cbChunk=%#x\n", cbNeeded, cMaxChunks, cbChunk),
1057 VERR_NO_MEMORY);
1058 pExecMemAllocator->uMagic = IEMEXECMEMALLOCATOR_MAGIC;
1059 pExecMemAllocator->cbChunk = cbChunk;
1060 pExecMemAllocator->cMaxChunks = cMaxChunks;
1061 pExecMemAllocator->cChunks = 0;
1062 pExecMemAllocator->idxChunkHint = 0;
1063 pExecMemAllocator->cAllocations = 0;
1064 pExecMemAllocator->cbTotal = 0;
1065 pExecMemAllocator->cbFree = 0;
1066 pExecMemAllocator->cbAllocated = 0;
1067#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1068 pExecMemAllocator->pbmAlloc = (uint64_t *)((uintptr_t)pExecMemAllocator + offBitmaps);
1069 pExecMemAllocator->cUnitsPerChunk = cbChunk >> IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT;
1070 pExecMemAllocator->cBitmapElementsPerChunk = cbChunk >> (IEMEXECMEM_ALT_SUB_ALLOC_UNIT_SHIFT + 6);
1071 memset(pExecMemAllocator->pbmAlloc, 0xff, cbBitmap); /* Mark everything as allocated. Clear when chunks are added. */
1072#endif
1073#if defined(IN_RING3) && !defined(RT_OS_WINDOWS)
1074 pExecMemAllocator->paEhFrames = (PIEMEXECMEMCHUNKEHFRAME)((uintptr_t)pExecMemAllocator + offEhFrames);
1075#endif
1076 for (uint32_t i = 0; i < cMaxChunks; i++)
1077 {
1078#ifdef IEMEXECMEM_USE_ALT_SUB_ALLOCATOR
1079 pExecMemAllocator->aChunks[i].cFreeUnits = 0;
1080 pExecMemAllocator->aChunks[i].idxFreeHint = 0;
1081#else
1082 pExecMemAllocator->aChunks[i].hHeap = NIL_RTHEAPSIMPLE;
1083#endif
1084 pExecMemAllocator->aChunks[i].pvChunk = NULL;
1085#ifdef IN_RING0
1086 pExecMemAllocator->aChunks[i].hMemObj = NIL_RTR0MEMOBJ;
1087#else
1088 pExecMemAllocator->aChunks[i].pvUnwindInfo = NULL;
1089#endif
1090 }
1091 pVCpu->iem.s.pExecMemAllocatorR3 = pExecMemAllocator;
1092
1093 /*
1094 * Do the initial allocations.
1095 */
1096 while ((uint64_t)pExecMemAllocator->cChunks * pExecMemAllocator->cbChunk < cbInitial)
1097 {
1098 int rc = iemExecMemAllocatorGrow(pExecMemAllocator);
1099 AssertLogRelRCReturn(rc, rc);
1100 }
1101
1102 pExecMemAllocator->idxChunkHint = 0;
1103
1104 return VINF_SUCCESS;
1105}
1106
1107
1108/*********************************************************************************************************************************
1109* Native Recompilation *
1110*********************************************************************************************************************************/
1111
1112
1113/**
1114 * Used by TB code when encountering a non-zero status or rcPassUp after a call.
1115 */
1116IEM_DECL_IMPL_DEF(int, iemNativeHlpExecStatusCodeFiddling,(PVMCPUCC pVCpu, int rc, uint8_t idxInstr))
1117{
1118 pVCpu->iem.s.cInstructions += idxInstr;
1119 return VBOXSTRICTRC_VAL(iemExecStatusCodeFiddling(pVCpu, rc == VINF_IEM_REEXEC_BREAK ? VINF_SUCCESS : rc));
1120}
1121
1122
1123/**
1124 * Reinitializes the native recompiler state.
1125 *
1126 * Called before starting a new recompile job.
1127 */
1128static PIEMRECOMPILERSTATE iemNativeReInit(PIEMRECOMPILERSTATE pReNative)
1129{
1130 pReNative->cLabels = 0;
1131 pReNative->cFixups = 0;
1132 return pReNative;
1133}
1134
1135
1136/**
1137 * Allocates and initializes the native recompiler state.
1138 *
1139 * This is called the first time an EMT wants to recompile something.
1140 *
1141 * @returns Pointer to the new recompiler state.
1142 * @param pVCpu The cross context virtual CPU structure of the calling
1143 * thread.
1144 * @thread EMT(pVCpu)
1145 */
1146static PIEMRECOMPILERSTATE iemNativeInit(PVMCPUCC pVCpu)
1147{
1148 VMCPU_ASSERT_EMT(pVCpu);
1149
1150 PIEMRECOMPILERSTATE pReNative = (PIEMRECOMPILERSTATE)RTMemAllocZ(sizeof(*pReNative));
1151 AssertReturn(pReNative, NULL);
1152
1153 /*
1154 * Try allocate all the buffers and stuff we need.
1155 */
1156 pReNative->pInstrBuf = (PIEMNATIVEINSTR)RTMemAllocZ(_64K);
1157 pReNative->paLabels = (PIEMNATIVELABEL)RTMemAllocZ(sizeof(IEMNATIVELABEL) * _8K);
1158 pReNative->paFixups = (PIEMNATIVEFIXUP)RTMemAllocZ(sizeof(IEMNATIVEFIXUP) * _16K);
1159 if (RT_LIKELY( pReNative->pInstrBuf
1160 && pReNative->paLabels
1161 && pReNative->paFixups))
1162 {
1163 /*
1164 * Set the buffer & array sizes on success.
1165 */
1166 pReNative->cInstrBufAlloc = _64K / sizeof(IEMNATIVEINSTR);
1167 pReNative->cLabelsAlloc = _8K;
1168 pReNative->cFixupsAlloc = _16K;
1169
1170 /*
1171 * Done, just need to save it and reinit it.
1172 */
1173 pVCpu->iem.s.pNativeRecompilerStateR3 = pReNative;
1174 return iemNativeReInit(pReNative);
1175 }
1176
1177 /*
1178 * Failed. Cleanup and return.
1179 */
1180 AssertFailed();
1181 RTMemFree(pReNative->pInstrBuf);
1182 RTMemFree(pReNative->paLabels);
1183 RTMemFree(pReNative->paFixups);
1184 RTMemFree(pReNative);
1185 return NULL;
1186}
1187
1188
1189/**
1190 * Defines a label.
1191 *
1192 * @returns Label ID.
1193 * @param pReNative The native recompile state.
1194 * @param enmType The label type.
1195 * @param offWhere The instruction offset of the label. UINT32_MAX if the
1196 * label is not yet defined (default).
1197 * @param uData Data associated with the label. Only applicable to
1198 * certain types of labels. Default is zero.
1199 */
1200DECLHIDDEN(uint32_t) iemNativeMakeLabel(PIEMRECOMPILERSTATE pReNative, IEMNATIVELABELTYPE enmType,
1201 uint32_t offWhere /*= UINT32_MAX*/, uint16_t uData /*= 0*/) RT_NOEXCEPT
1202{
1203 /*
1204 * Do we have the label already?
1205 */
1206 PIEMNATIVELABEL paLabels = pReNative->paLabels;
1207 uint32_t const cLabels = pReNative->cLabels;
1208 for (uint32_t i = 0; i < cLabels; i++)
1209 if ( paLabels[i].enmType == enmType
1210 && paLabels[i].uData == uData)
1211 {
1212 if (paLabels[i].off == offWhere || offWhere == UINT32_MAX)
1213 return i;
1214 if (paLabels[i].off == UINT32_MAX)
1215 {
1216 paLabels[i].off = offWhere;
1217 return i;
1218 }
1219 }
1220
1221 /*
1222 * Make sure we've got room for another label.
1223 */
1224 if (RT_LIKELY(cLabels < pReNative->cLabelsAlloc))
1225 { /* likely */ }
1226 else
1227 {
1228 uint32_t cNew = pReNative->cLabelsAlloc;
1229 AssertReturn(cNew, UINT32_MAX);
1230 AssertReturn(cLabels == cNew, UINT32_MAX);
1231 cNew *= 2;
1232 AssertReturn(cNew <= _64K, UINT32_MAX); /* IEMNATIVEFIXUP::idxLabel type restricts this */
1233 paLabels = (PIEMNATIVELABEL)RTMemRealloc(paLabels, cNew * sizeof(paLabels[0]));
1234 AssertReturn(paLabels, UINT32_MAX);
1235 pReNative->paLabels = paLabels;
1236 pReNative->cLabelsAlloc = cNew;
1237 }
1238
1239 /*
1240 * Define a new label.
1241 */
1242 paLabels[cLabels].off = offWhere;
1243 paLabels[cLabels].enmType = enmType;
1244 paLabels[cLabels].uData = uData;
1245 pReNative->cLabels = cLabels + 1;
1246 return cLabels;
1247}
1248
1249
1250/**
1251 * Looks up a label.
1252 *
1253 * @returns Label ID if found, UINT32_MAX if not.
1254 */
1255static uint32_t iemNativeFindLabel(PIEMRECOMPILERSTATE pReNative, IEMNATIVELABELTYPE enmType,
1256 uint32_t offWhere = UINT32_MAX, uint16_t uData = 0) RT_NOEXCEPT
1257{
1258 PIEMNATIVELABEL paLabels = pReNative->paLabels;
1259 uint32_t const cLabels = pReNative->cLabels;
1260 for (uint32_t i = 0; i < cLabels; i++)
1261 if ( paLabels[i].enmType == enmType
1262 && paLabels[i].uData == uData
1263 && ( paLabels[i].off == offWhere
1264 || offWhere == UINT32_MAX
1265 || paLabels[i].off == UINT32_MAX))
1266 return i;
1267 return UINT32_MAX;
1268}
1269
1270
1271
1272/**
1273 * Adds a fixup.
1274 *
1275 * @returns Success indicator.
1276 * @param pReNative The native recompile state.
1277 * @param offWhere The instruction offset of the fixup location.
1278 * @param idxLabel The target label ID for the fixup.
1279 * @param enmType The fixup type.
1280 * @param offAddend Fixup addend if applicable to the type. Default is 0.
1281 */
1282DECLHIDDEN(bool) iemNativeAddFixup(PIEMRECOMPILERSTATE pReNative, uint32_t offWhere, uint32_t idxLabel,
1283 IEMNATIVEFIXUPTYPE enmType, int8_t offAddend /*= 0*/) RT_NOEXCEPT
1284{
1285 Assert(idxLabel <= UINT16_MAX);
1286 Assert((unsigned)enmType <= UINT8_MAX);
1287
1288 /*
1289 * Make sure we've room.
1290 */
1291 PIEMNATIVEFIXUP paFixups = pReNative->paFixups;
1292 uint32_t const cFixups = pReNative->cFixups;
1293 if (RT_LIKELY(cFixups < pReNative->cFixupsAlloc))
1294 { /* likely */ }
1295 else
1296 {
1297 uint32_t cNew = pReNative->cFixupsAlloc;
1298 AssertReturn(cNew, false);
1299 AssertReturn(cFixups == cNew, false);
1300 cNew *= 2;
1301 AssertReturn(cNew <= _128K, false);
1302 paFixups = (PIEMNATIVEFIXUP)RTMemRealloc(paFixups, cNew * sizeof(paFixups[0]));
1303 AssertReturn(paFixups, false);
1304 pReNative->paFixups = paFixups;
1305 pReNative->cFixupsAlloc = cNew;
1306 }
1307
1308 /*
1309 * Add the fixup.
1310 */
1311 paFixups[cFixups].off = offWhere;
1312 paFixups[cFixups].idxLabel = (uint16_t)idxLabel;
1313 paFixups[cFixups].enmType = enmType;
1314 paFixups[cFixups].offAddend = offAddend;
1315 pReNative->cFixups = cFixups + 1;
1316 return true;
1317}
1318
1319/**
1320 * Slow code path for iemNativeInstrBufEnsure.
1321 */
1322DECLHIDDEN(PIEMNATIVEINSTR) iemNativeInstrBufEnsureSlow(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1323 uint32_t cInstrReq) RT_NOEXCEPT
1324{
1325 /* Double the buffer size till we meet the request. */
1326 uint32_t cNew = pReNative->cInstrBufAlloc;
1327 AssertReturn(cNew > 0, NULL);
1328 do
1329 cNew *= 2;
1330 while (cNew < off + cInstrReq);
1331
1332 uint32_t const cbNew = cNew * sizeof(IEMNATIVEINSTR);
1333 AssertReturn(cbNew <= _2M, NULL);
1334
1335 void *pvNew = RTMemRealloc(pReNative->pInstrBuf, cbNew);
1336 AssertReturn(pvNew, NULL);
1337
1338 pReNative->cInstrBufAlloc = cNew;
1339 return pReNative->pInstrBuf = (PIEMNATIVEINSTR)pvNew;
1340}
1341
1342
1343/**
1344 * Emits code for checking the return code of a call and rcPassUp, returning
1345 * from the code if either is non-zero.
1346 */
1347DECLHIDDEN(uint32_t) iemNativeEmitCheckCallRetAndPassUp(PIEMRECOMPILERSTATE pReNative, uint32_t off,
1348 uint8_t idxInstr) RT_NOEXCEPT
1349{
1350#ifdef RT_ARCH_AMD64
1351 /*
1352 * AMD64: eax = call status code.
1353 */
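    /* Roughly the instruction sequence emitted by this path (informal):
     *      mov     edx, [pVCpu + offsetof(VMCPUCC, iem.s.rcPassUp)]
     *      or      edx, eax
     *      mov     cl, idxInstr
     *      jnz     NonZeroRetOrPassUp          ; rel32 filled in by the fixup below
     */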
1354
1355 /* edx = rcPassUp */
1356 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, X86_GREG_xDX, RT_UOFFSETOF(VMCPUCC, iem.s.rcPassUp));
1357 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1358
1359 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1360 AssertReturn(pbCodeBuf, UINT32_MAX);
1361
1362 /* edx = eax | rcPassUp */
1363 pbCodeBuf[off++] = 0x0b; /* or edx, eax */
1364 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xDX, X86_GREG_xAX);
1365
1366 /* Jump to non-zero status return path, loading cl with the instruction number. */
1367 pbCodeBuf[off++] = 0xb0 + X86_GREG_xCX; /* mov cl, imm8 (pCallEntry->idxInstr) */
1368 pbCodeBuf[off++] = idxInstr;
1369
1370 pbCodeBuf[off++] = 0x0f; /* jnz rel32 */
1371 pbCodeBuf[off++] = 0x85;
1372 uint32_t const idxLabel = iemNativeMakeLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1373 AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
1374 AssertReturn(iemNativeAddFixup(pReNative, off, idxLabel, kIemNativeFixupType_Rel32, -4), UINT32_MAX);
1375 pbCodeBuf[off++] = 0x00;
1376 pbCodeBuf[off++] = 0x00;
1377 pbCodeBuf[off++] = 0x00;
1378 pbCodeBuf[off++] = 0x00;
1379
1380 /* done. */
1381
1382#elif RT_ARCH_ARM64
1383 /*
1384 * ARM64: w0 = call status code.
1385 */
1386 off = iemNativeEmitLoadGprImm64(pReNative, off, ARMV8_A64_REG_X2, idxInstr); /** @todo 32-bit imm load? Fixed counter register? */
1387 off = iemNativeEmitLoadGprFromVCpuU32(pReNative, off, ARMV8_A64_REG_X3, RT_UOFFSETOF(VMCPUCC, iem.s.rcPassUp));
1388
1389 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 3);
1390 AssertReturn(pu32CodeBuf, UINT32_MAX);
1391
1392 pu32CodeBuf[off++] = Armv8A64MkInstrOrr(ARMV8_A64_REG_X4, ARMV8_A64_REG_X3, ARMV8_A64_REG_X0, false /*f64Bit*/);
1393
1394 uint32_t const idxLabel = iemNativeMakeLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1395 AssertReturn(idxLabel != UINT32_MAX, UINT32_MAX);
1396 AssertReturn(iemNativeAddFixup(pReNative, off, idxLabel, kIemNativeFixupType_RelImm19At5), UINT32_MAX);
1397 pu32CodeBuf[off++] = Armv8A64MkInstrCbzCbnz(true /*fJmpIfNotZero*/, ARMV8_A64_REG_X4, false /*f64Bit*/);
1398
1399#else
1400# error "port me"
1401#endif
1402 return off;
1403}
1404
1405
1406/**
1407 * Emits a call to a CImpl function or something similar.
1408 */
1409static int32_t iemNativeEmitCImplCall(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1410 uintptr_t pfnCImpl, uint8_t cbInstr, uint8_t cAddParams,
1411 uint64_t uParam0, uint64_t uParam1, uint64_t uParam2)
1412{
1413#ifdef VBOX_STRICT
1414 off = iemNativeEmitMarker(pReNative, off);
1415 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1416#endif
1417
1418 /*
1419 * Load the parameters.
1420 */
1421#if defined(RT_OS_WINDOWS) && defined(VBOXSTRICTRC_STRICT_ENABLED)
1422 /* Special-case the hidden VBOXSTRICTRC pointer. */
1423 off = iemNativeEmitLoadGprFromGpr( pReNative, off, IEMNATIVE_CALL_ARG1_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1424 off = iemNativeEmitLoadGprImm64( pReNative, off, IEMNATIVE_CALL_ARG2_GREG, cbInstr); /** @todo 8-bit reg load opt for amd64 */
1425 if (cAddParams > 0)
1426 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, uParam0);
1427 if (cAddParams > 1)
1428 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, uParam1);
1429 if (cAddParams > 2)
1430 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG1, uParam2);
1431 off = iemNativeEmitLeaGrpByBp(pReNative, off, X86_GREG_xCX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict */
1432
1433#else
1434 AssertCompile(IEMNATIVE_CALL_ARG_GREG_COUNT >= 4);
1435 off = iemNativeEmitLoadGprFromGpr( pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1436 off = iemNativeEmitLoadGprImm64( pReNative, off, IEMNATIVE_CALL_ARG1_GREG, cbInstr); /** @todo 8-bit reg load opt for amd64 */
1437 if (cAddParams > 0)
1438 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG2_GREG, uParam0);
1439 if (cAddParams > 1)
1440 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, uParam1);
1441 if (cAddParams > 2)
1442# if IEMNATIVE_CALL_ARG_GREG_COUNT >= 5
1443 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG4_GREG, uParam2);
1444# else
1445 off = iemNativeEmitStoreImm64ByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, uParam2);
1446# endif
1447#endif
1448 AssertReturn(off != UINT32_MAX, off);
1449
1450 /*
1451 * Make the call.
1452 */
1453#ifdef RT_ARCH_AMD64
1454 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, pfnCImpl);
1455
1456 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1457 AssertReturn(pbCodeBuf, UINT32_MAX);
1458 pbCodeBuf[off++] = 0xff; /* call rax */
1459 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1460
1461# if defined(VBOXSTRICTRC_STRICT_ENABLED) && defined(RT_OS_WINDOWS)
1462 off = iemNativeEmitLoadGprByBpU32(pReNative, off, X86_GREG_xAX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict (see above) */
1463# endif
1464
1465#elif defined(RT_ARCH_ARM64)
1466 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0, pfnCImpl);
1467
1468 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1469 AssertReturn(pu32CodeBuf, UINT32_MAX);
1470 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1471
1472#else
1473# error "Port me!"
1474#endif
1475
1476 /*
1477 * Check the status code.
1478 */
1479 return iemNativeEmitCheckCallRetAndPassUp(pReNative, off, idxInstr);
1480}
1481
1482
1483/**
1484 * Emits a call to a threaded worker function.
1485 */
1486static int32_t iemNativeEmitThreadedCall(PIEMRECOMPILERSTATE pReNative, uint32_t off, PCIEMTHRDEDCALLENTRY pCallEntry)
1487{
1488#ifdef VBOX_STRICT
1489 off = iemNativeEmitMarker(pReNative, off);
1490 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1491#endif
1492 uint8_t const cParams = g_acIemThreadedFunctionUsedArgs[pCallEntry->enmFunction];
1493
1494#ifdef RT_ARCH_AMD64
1495 /* Load the parameters and emit the call. */
1496# ifdef RT_OS_WINDOWS
1497# ifndef VBOXSTRICTRC_STRICT_ENABLED
1498 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xCX, IEMNATIVE_REG_FIXED_PVMCPU);
1499 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1500 if (cParams > 0)
1501 {
1502 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xDX, pCallEntry->auParams[0]);
1503 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1504 }
1505 if (cParams > 1)
1506 {
1507 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x8, pCallEntry->auParams[1]);
1508 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1509 }
1510 if (cParams > 2)
1511 {
1512 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x9, pCallEntry->auParams[2]);
1513 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1514 }
1515# else /* VBOXSTRICTRC: Returned via hidden parameter. Sigh. */
1516 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, IEMNATIVE_REG_FIXED_PVMCPU);
1517 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1518 if (cParams > 0)
1519 {
1520 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x8, pCallEntry->auParams[0]);
1521 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1522 }
1523 if (cParams > 1)
1524 {
1525 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x9, pCallEntry->auParams[1]);
1526 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1527 }
1528 if (cParams > 2)
1529 {
1530 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_x10, pCallEntry->auParams[2]);
1531 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1532 }
1533 off = iemNativeEmitStoreGprByBp(pReNative, off, IEMNATIVE_FP_OFF_STACK_ARG0, X86_GREG_x10);
1534 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1535 off = iemNativeEmitLeaGrpByBp(pReNative, off, X86_GREG_xCX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict */
1536 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1537# endif /* VBOXSTRICTRC_STRICT_ENABLED */
1538# else
1539 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDI, IEMNATIVE_REG_FIXED_PVMCPU);
1540 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1541 if (cParams > 0)
1542 {
1543 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xSI, pCallEntry->auParams[0]);
1544 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1545 }
1546 if (cParams > 1)
1547 {
1548 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xDX, pCallEntry->auParams[1]);
1549 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1550 }
1551 if (cParams > 2)
1552 {
1553 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xCX, pCallEntry->auParams[2]);
1554 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1555 }
1556# endif
1557 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, (uintptr_t)g_apfnIemThreadedFunctions[pCallEntry->enmFunction]);
1558 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1559
1560 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1561 AssertReturn(pbCodeBuf, UINT32_MAX);
1562 pbCodeBuf[off++] = 0xff; /* call rax */
1563 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1564
1565# if defined(VBOXSTRICTRC_STRICT_ENABLED) && defined(RT_OS_WINDOWS)
1566 off = iemNativeEmitLoadGprByBpU32(pReNative, off, X86_GREG_xAX, IEMNATIVE_FP_OFF_IN_SHADOW_ARG0); /* rcStrict (see above) */
1567# endif
1568
1569#elif RT_ARCH_ARM64
1570 /*
1571 * ARM64:
1572 */
1573 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1574 if (cParams > 0)
1575 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG1_GREG, pCallEntry->auParams[0]);
1576 if (cParams > 1)
1577 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG2_GREG, pCallEntry->auParams[1]);
1578 if (cParams > 2)
1579 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_CALL_ARG3_GREG, pCallEntry->auParams[2]);
1580 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0,
1581 (uintptr_t)g_apfnIemThreadedFunctions[pCallEntry->enmFunction]);
1582
1583 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 1);
1584 AssertReturn(pu32CodeBuf, UINT32_MAX);
1585
1586 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1587
1588#else
1589# error "port me"
1590#endif
1591
1592 /*
1593 * Check the status code.
1594 */
1595 off = iemNativeEmitCheckCallRetAndPassUp(pReNative, off, pCallEntry->idxInstr);
1596 AssertReturn(off != UINT32_MAX, off);
1597
1598 return off;
1599}
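
/*
 * Note! The call emitted above is, in effect, the native equivalent of one
 *       iteration of the threaded interpreter loop (sketch only):
 *
 *          rcStrict = g_apfnIemThreadedFunctions[pCallEntry->enmFunction](pVCpu,
 *                         pCallEntry->auParams[0], pCallEntry->auParams[1],
 *                         pCallEntry->auParams[2]);
 *
 *       Only the first cParams parameters are actually materialized, and the
 *       return status is again funnelled through
 *       iemNativeEmitCheckCallRetAndPassUp().
 */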
1600
1601
1602/**
1603 * Emits the RC fiddling code for handling non-zero return code or rcPassUp.
1604 */
1605static uint32_t iemNativeEmitRcFiddling(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint32_t idxReturnLabel)
1606{
1607 /*
1608 * Generate the rc + rcPassUp fiddling code if needed.
1609 */
1610 uint32_t idxLabel = iemNativeFindLabel(pReNative, kIemNativeLabelType_NonZeroRetOrPassUp);
1611 if (idxLabel != UINT32_MAX)
1612 {
1613 Assert(pReNative->paLabels[idxLabel].off == UINT32_MAX);
1614 pReNative->paLabels[idxLabel].off = off;
1615
1616 /* iemNativeHlpExecStatusCodeFiddling(PVMCPUCC pVCpu, int rc, uint8_t idxInstr) */
1617#ifdef RT_ARCH_AMD64
1618 /*
1619 * AMD64:
1620 */
1621 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
1622 AssertReturn(pbCodeBuf, UINT32_MAX);
1623
1624 /* Call helper and jump to return point. */
1625# ifdef RT_OS_WINDOWS
1626 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_x8, X86_GREG_xCX); /* cl = instruction number */
1627 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1628 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xCX, IEMNATIVE_REG_FIXED_PVMCPU);
1629 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1630 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, X86_GREG_xAX);
1631 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1632# else
1633 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDI, IEMNATIVE_REG_FIXED_PVMCPU);
1634 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1635 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xSI, X86_GREG_xAX);
1636 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1637 off = iemNativeEmitLoadGprFromGpr(pReNative, off, X86_GREG_xDX, X86_GREG_xCX); /* cl = instruction number */
1638 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1639# endif
1640 off = iemNativeEmitLoadGprImm64(pReNative, off, X86_GREG_xAX, (uintptr_t)iemNativeHlpExecStatusCodeFiddling);
1641 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1642
1643 pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1644 AssertReturn(pbCodeBuf, UINT32_MAX);
1645 pbCodeBuf[off++] = 0xff; /* call rax */
1646 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, 2, X86_GREG_xAX);
1647
1648 /* Jump to common return point. */
1649 uint32_t offRel = pReNative->paLabels[idxReturnLabel].off - (off + 2);
1650 if (-(int32_t)offRel <= 127)
1651 {
1652 pbCodeBuf[off++] = 0xeb; /* jmp rel8 */
1653 pbCodeBuf[off++] = (uint8_t)offRel;
1654 off++;
1655 }
1656 else
1657 {
1658 offRel -= 3;
1659 pbCodeBuf[off++] = 0xe9; /* jmp rel32 */
1660 pbCodeBuf[off++] = RT_BYTE1(offRel);
1661 pbCodeBuf[off++] = RT_BYTE2(offRel);
1662 pbCodeBuf[off++] = RT_BYTE3(offRel);
1663 pbCodeBuf[off++] = RT_BYTE4(offRel);
1664 }
1665 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1666
1667#elif RT_ARCH_ARM64
1668 /*
1669 * ARM64:
1670 */
1671 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG1_GREG, IEMNATIVE_CALL_RET_GREG);
1672 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1673 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_CALL_ARG0_GREG, IEMNATIVE_REG_FIXED_PVMCPU);
1674 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1675 /* IEMNATIVE_CALL_ARG2_GREG is already set. */
1676 off = iemNativeEmitLoadGprImm64(pReNative, off, IEMNATIVE_REG_FIXED_TMP0, (uintptr_t)iemNativeHlpExecStatusCodeFiddling);
1677 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1678
1679 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 2);
1680 AssertReturn(pu32CodeBuf, UINT32_MAX);
1681 pu32CodeBuf[off++] = Armv8A64MkInstrBlr(IEMNATIVE_REG_FIXED_TMP0);
1682
1683 /* Jump back to the common return point. */
1684 int32_t const offRel = pReNative->paLabels[idxReturnLabel].off - off;
1685 pu32CodeBuf[off++] = Armv8A64MkInstrB(offRel);
1686#else
1687# error "port me"
1688#endif
1689 }
1690 return off;
1691}
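
/*
 * Note! On AMD64 the jump back to the common return label above is encoded as
 *       a short jump (EB rel8) when the backward distance fits in a signed
 *       byte and as a near jump (E9 rel32) otherwise; on ARM64 a single
 *       unconditional B (26-bit instruction displacement) always reaches.
 */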
1692
1693
1694/**
1695 * Emits a standard epilog.
1696 */
1697static uint32_t iemNativeEmitEpilog(PIEMRECOMPILERSTATE pReNative, uint32_t off)
1698{
1699 /*
1700 * Successful return, so clear the return register (eax, w0).
1701 */
1702 off = iemNativeEmitGprZero(pReNative, off, IEMNATIVE_CALL_RET_GREG);
1703 AssertReturn(off != UINT32_MAX, UINT32_MAX);
1704
1705 /*
1706 * Define label for common return point.
1707 */
1708 uint32_t const idxReturn = iemNativeMakeLabel(pReNative, kIemNativeLabelType_Return, off);
1709 AssertReturn(idxReturn != UINT32_MAX, UINT32_MAX);
1710
1711 /*
1712 * Restore registers and return.
1713 */
1714#ifdef RT_ARCH_AMD64
1715 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 20);
1716 AssertReturn(pbCodeBuf, UINT32_MAX);
1717
1718 /* Reposition rsp at the r15 restore point. */
1719 pbCodeBuf[off++] = X86_OP_REX_W;
1720 pbCodeBuf[off++] = 0x8d; /* lea rsp, [rbp - (gcc ? 5 : 7) * 8] */
1721 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_MEM1, X86_GREG_xSP, X86_GREG_xBP);
1722 pbCodeBuf[off++] = (uint8_t)IEMNATIVE_FP_OFF_LAST_PUSH;
1723
1724 /* Pop non-volatile registers and return */
1725 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r15 */
1726 pbCodeBuf[off++] = 0x58 + X86_GREG_x15 - 8;
1727 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r14 */
1728 pbCodeBuf[off++] = 0x58 + X86_GREG_x14 - 8;
1729 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r13 */
1730 pbCodeBuf[off++] = 0x58 + X86_GREG_x13 - 8;
1731 pbCodeBuf[off++] = X86_OP_REX_B; /* pop r12 */
1732 pbCodeBuf[off++] = 0x58 + X86_GREG_x12 - 8;
1733# ifdef RT_OS_WINDOWS
1734 pbCodeBuf[off++] = 0x58 + X86_GREG_xDI; /* pop rdi */
1735 pbCodeBuf[off++] = 0x58 + X86_GREG_xSI; /* pop rsi */
1736# endif
1737 pbCodeBuf[off++] = 0x58 + X86_GREG_xBX; /* pop rbx */
1738 pbCodeBuf[off++] = 0xc9; /* leave */
1739 pbCodeBuf[off++] = 0xc3; /* ret */
1740 pbCodeBuf[off++] = 0xcc; /* int3 poison */
1741
1742#elif RT_ARCH_ARM64
1743 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1744 AssertReturn(pu32CodeBuf, UINT32_MAX);
1745
1746 /* ldp x19, x20, [sp, #IEMNATIVE_FRAME_VAR_SIZE]! ; Deallocate the variable space and restore x19+x20. */
1747 AssertCompile(IEMNATIVE_FRAME_VAR_SIZE < 64*8);
1748 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kPreIndex,
1749 ARMV8_A64_REG_X19, ARMV8_A64_REG_X20, ARMV8_A64_REG_SP,
1750 IEMNATIVE_FRAME_VAR_SIZE / 8);
1751 /* Restore x21 thru x28 + BP and LR (ret address) (SP remains unchanged in the kSigned variant). */
1752 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1753 ARMV8_A64_REG_X21, ARMV8_A64_REG_X22, ARMV8_A64_REG_SP, 2);
1754 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1755 ARMV8_A64_REG_X23, ARMV8_A64_REG_X24, ARMV8_A64_REG_SP, 4);
1756 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1757 ARMV8_A64_REG_X25, ARMV8_A64_REG_X26, ARMV8_A64_REG_SP, 6);
1758 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1759 ARMV8_A64_REG_X27, ARMV8_A64_REG_X28, ARMV8_A64_REG_SP, 8);
1760 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(true /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1761 ARMV8_A64_REG_BP, ARMV8_A64_REG_LR, ARMV8_A64_REG_SP, 10);
1762 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1763
1764 /* add sp, sp, IEMNATIVE_FRAME_SAVE_REG_SIZE ; */
1765 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE < 4096);
1766 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(false /*fSub*/, ARMV8_A64_REG_SP, ARMV8_A64_REG_SP, IEMNATIVE_FRAME_SAVE_REG_SIZE);
1767
1768 /* retab / ret */
1769# ifdef RT_OS_DARWIN /** @todo See todo on pacibsp in the prolog. */
1770 if (1)
1771 pu32CodeBuf[off++] = ARMV8_A64_INSTR_RETAB;
1772 else
1773# endif
1774 pu32CodeBuf[off++] = ARMV8_A64_INSTR_RET;
1775
1776#else
1777# error "port me"
1778#endif
1779
1780 return iemNativeEmitRcFiddling(pReNative, off, idxReturn);
1781}
1782
1783
1784/**
1785 * Emits a standard prolog.
1786 */
1787static uint32_t iemNativeEmitProlog(PIEMRECOMPILERSTATE pReNative, uint32_t off)
1788{
1789#ifdef RT_ARCH_AMD64
1790 /*
1791 * Set up a regular xBP stack frame, pushing all non-volatile GPRs,
1792 * reserving 64 bytes for stack variables plus 4 non-register argument
1793 * slots. Fixed register assignment: xBX = pVCpu.
1794 *
1795 * Since we always do the same register spilling, we can use the same
1796 * unwind description for all the code.
1797 */
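/*
 * After the pushes below the frame looks roughly like this (offsets are
 * relative to xBP; Windows additionally saves rsi/rdi, which is what the
 * "(gcc ? 5 : 7) * 8" remark in the epilog refers to):
 *
 *      [xBP + 8]       return address
 *      [xBP]           saved xBP
 *      [xBP -  8]      saved xBX (holds pVCpu while we execute)
 *      [xBP - ...]     rsi/rdi (Windows only), then r12 thru r15
 *      [xSP ...]       variable area, stack args and shadow arg slots
 */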
1798 uint8_t *pbCodeBuf = iemNativeInstrBufEnsure(pReNative, off, 32);
1799 AssertReturn(pbCodeBuf, UINT32_MAX);
1800 pbCodeBuf[off++] = 0x50 + X86_GREG_xBP; /* push rbp */
1801 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbp, rsp */
1802 pbCodeBuf[off++] = 0x8b;
1803 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBP, X86_GREG_xSP);
1804 pbCodeBuf[off++] = 0x50 + X86_GREG_xBX; /* push rbx */
1805 AssertCompile(IEMNATIVE_REG_FIXED_PVMCPU == X86_GREG_xBX);
1806# ifdef RT_OS_WINDOWS
1807 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rcx ; RBX = pVCpu */
1808 pbCodeBuf[off++] = 0x8b;
1809 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xCX);
1810 pbCodeBuf[off++] = 0x50 + X86_GREG_xSI; /* push rsi */
1811 pbCodeBuf[off++] = 0x50 + X86_GREG_xDI; /* push rdi */
1812# else
1813 pbCodeBuf[off++] = X86_OP_REX_W; /* mov rbx, rdi ; RBX = pVCpu */
1814 pbCodeBuf[off++] = 0x8b;
1815 pbCodeBuf[off++] = X86_MODRM_MAKE(X86_MOD_REG, X86_GREG_xBX, X86_GREG_xDI);
1816# endif
1817 pbCodeBuf[off++] = X86_OP_REX_B; /* push r12 */
1818 pbCodeBuf[off++] = 0x50 + X86_GREG_x12 - 8;
1819 pbCodeBuf[off++] = X86_OP_REX_B; /* push r13 */
1820 pbCodeBuf[off++] = 0x50 + X86_GREG_x13 - 8;
1821 pbCodeBuf[off++] = X86_OP_REX_B; /* push r14 */
1822 pbCodeBuf[off++] = 0x50 + X86_GREG_x14 - 8;
1823 pbCodeBuf[off++] = X86_OP_REX_B; /* push r15 */
1824 pbCodeBuf[off++] = 0x50 + X86_GREG_x15 - 8;
1825
1826 off = iemNativeEmitSubGprImm(pReNative, off, /* sub rsp, byte 28h */
1827 X86_GREG_xSP,
1828 IEMNATIVE_FRAME_ALIGN_SIZE
1829 + IEMNATIVE_FRAME_VAR_SIZE
1830 + IEMNATIVE_FRAME_STACK_ARG_COUNT * 8
1831 + IEMNATIVE_FRAME_SHADOW_ARG_COUNT * 8);
1832 AssertCompile(!(IEMNATIVE_FRAME_VAR_SIZE & 0xf));
1833 AssertCompile(!(IEMNATIVE_FRAME_STACK_ARG_COUNT & 0x1));
1834 AssertCompile(!(IEMNATIVE_FRAME_SHADOW_ARG_COUNT & 0x1));
1835
1836#elif RT_ARCH_ARM64
1837 /*
1838 * We set up a stack frame exactly like on x86, only we have to push the
1839 * return address ourselves here. We save all non-volatile registers.
1840 */
1841 uint32_t *pu32CodeBuf = iemNativeInstrBufEnsure(pReNative, off, 10);
1842 AssertReturn(pu32CodeBuf, UINT32_MAX);
1843
1844 # ifdef RT_OS_DARWIN /** @todo This seems to be a requirement by libunwind for JIT FDEs. Investigate further, as we've been
1845 * unable to figure out where the BRK following the AUTHB*+XPACB* stuff in libunwind comes from. It's
1846 * definitely in the dwarf stepping code, but until that's found it's very tedious to figure out whether it's
1847 * in any way conditional, so we just emit this instruction now and hope for the best... */
1848 /* pacibsp */
1849 pu32CodeBuf[off++] = ARMV8_A64_INSTR_PACIBSP;
1850# endif
1851
1852 /* stp x19, x20, [sp, #-IEMNATIVE_FRAME_SAVE_REG_SIZE] ; Allocate space for saving registers and place x19+x20 at the bottom. */
1853 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE < 64*8);
1854 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kPreIndex,
1855 ARMV8_A64_REG_X19, ARMV8_A64_REG_X20, ARMV8_A64_REG_SP,
1856 -IEMNATIVE_FRAME_SAVE_REG_SIZE / 8);
1857 /* Save x21 thru x28 (SP remains unchanged in the kSigned variant). */
1858 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1859 ARMV8_A64_REG_X21, ARMV8_A64_REG_X22, ARMV8_A64_REG_SP, 2);
1860 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1861 ARMV8_A64_REG_X23, ARMV8_A64_REG_X24, ARMV8_A64_REG_SP, 4);
1862 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1863 ARMV8_A64_REG_X25, ARMV8_A64_REG_X26, ARMV8_A64_REG_SP, 6);
1864 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1865 ARMV8_A64_REG_X27, ARMV8_A64_REG_X28, ARMV8_A64_REG_SP, 8);
1866 /* Save the BP and LR (ret address) registers at the top of the frame. */
1867 pu32CodeBuf[off++] = Armv8A64MkInstrStLdPair(false /*fLoad*/, 2 /*64-bit*/, kArm64InstrStLdPairType_kSigned,
1868 ARMV8_A64_REG_BP, ARMV8_A64_REG_LR, ARMV8_A64_REG_SP, 10);
1869 AssertCompile(IEMNATIVE_FRAME_SAVE_REG_SIZE / 8 == 12);
1870 /* add bp, sp, IEMNATIVE_FRAME_SAVE_REG_SIZE - 16 ; Set BP to point to the old BP stack address. */
1871 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(false /*fSub*/, ARMV8_A64_REG_BP,
1872 ARMV8_A64_REG_SP, IEMNATIVE_FRAME_SAVE_REG_SIZE - 16);
1873
1874 /* sub sp, sp, IEMNATIVE_FRAME_VAR_SIZE ; Allocate the variable area from SP. */
1875 pu32CodeBuf[off++] = Armv8A64MkInstrAddSub(true /*fSub*/, ARMV8_A64_REG_SP, ARMV8_A64_REG_SP, IEMNATIVE_FRAME_VAR_SIZE);
1876
1877 /* mov r28, r0 */
1878 off = iemNativeEmitLoadGprFromGpr(pReNative, off, IEMNATIVE_REG_FIXED_PVMCPU, IEMNATIVE_CALL_ARG0_GREG);
1879
1880#else
1881# error "port me"
1882#endif
1883 return off;
1884}
1885
1886
1887DECLINLINE(uint32_t) iemNativeEmitCImplCall1(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1888 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0)
1889{
1890 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 1, uArg0, 0, 0);
1891}
1892
1893
1894DECLINLINE(uint32_t) iemNativeEmitCImplCall2(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1895 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0, uint64_t uArg1)
1896{
1897 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 2, uArg0, uArg1, 0);
1898}
1899
1900
1901DECLINLINE(uint32_t) iemNativeEmitCImplCall3(PIEMRECOMPILERSTATE pReNative, uint32_t off, uint8_t idxInstr,
1902 uintptr_t pfnCImpl, uint8_t cbInstr, uint64_t uArg0, uint64_t uArg1, uint64_t uArg2)
1903{
1904 return iemNativeEmitCImplCall(pReNative, off, idxInstr, pfnCImpl, cbInstr, 3, uArg0, uArg1, uArg2);
1905}
1906
1907
1908/*
1909 * MC definitions for the native recompiler.
1910 */
1911
1912#define IEM_MC_DEFER_TO_CIMPL_0_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl) \
1913 return iemNativeEmitCImplCall0(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr) /** @todo not used ... */
1914
1915#define IEM_MC_DEFER_TO_CIMPL_1_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0) \
1916 return iemNativeEmitCImplCall1(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0)
1917
1918#define IEM_MC_DEFER_TO_CIMPL_2_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0, a1) \
1919 return iemNativeEmitCImplCall2(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0, a1)
1920
1921#define IEM_MC_DEFER_TO_CIMPL_3_RET_THREADED(a_cbInstr, a_fFlags, a_pfnCImpl, a0, a1, a2) \
1922 return iemNativeEmitCImplCall3(pReNative, off, pCallEntry->idxInstr, (uintptr_t)a_pfnCImpl, a_cbInstr, a0, a1, a2)
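
/*
 * For illustration (the instruction and function names here are hypothetical):
 * a generated recompiler function that defers an instruction to a two-argument
 * C implementation would use the corresponding macro like this
 *
 *      IEM_MC_DEFER_TO_CIMPL_2_RET_THREADED(cbInstr, fFlags, iemCImpl_Something, uArg0, uArg1);
 *
 * which, per the definition above, expands to
 *
 *      return iemNativeEmitCImplCall2(pReNative, off, pCallEntry->idxInstr,
 *                                     (uintptr_t)iemCImpl_Something, cbInstr, uArg0, uArg1);
 */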
1923
1924/*
1925 * Builtin functions.
1926 */
1927
1928/**
1929 * Built-in function that calls a C-implementation function taking zero arguments.
1930 */
1931static IEM_DECL_IEMNATIVERECOMPFUNC_DEF(iemNativeRecompFunc_BltIn_DeferToCImpl0)
1932{
1933 PFNIEMCIMPL0 const pfnCImpl = (PFNIEMCIMPL0)(uintptr_t)pCallEntry->auParams[0];
1934 uint8_t const cbInstr = (uint8_t)pCallEntry->auParams[1];
1935 return iemNativeEmitCImplCall(pReNative, off, pCallEntry->idxInstr, (uintptr_t)pfnCImpl, cbInstr, 0, 0, 0, 0);
1936}
1937
1938
1939
1940/*
1941 * Include g_apfnIemNativeRecompileFunctions and associated functions.
1942 *
1943 * This should probably live in its own file later, but let's see what the
1944 * compile times turn out to be first.
1945 */
1946#include "IEMNativeFunctions.cpp.h"
1947
1948
1949/**
1950 * Recompiles the given threaded TB into a native one.
1951 *
1952 * In case of failure the translation block will be returned as-is.
1953 *
1954 * @returns pTb.
1955 * @param pVCpu The cross context virtual CPU structure of the calling
1956 * thread.
1957 * @param pTb The threaded translation block to recompile to native.
1958 */
1959PIEMTB iemNativeRecompile(PVMCPUCC pVCpu, PIEMTB pTb)
1960{
1961 /*
1962 * The first time thru, we allocate the recompiler state; the other times
1963 * we just need to reset it before using it again.
1964 */
1965 PIEMRECOMPILERSTATE pReNative = pVCpu->iem.s.pNativeRecompilerStateR3;
1966 if (RT_LIKELY(pReNative))
1967 iemNativeReInit(pReNative);
1968 else
1969 {
1970 pReNative = iemNativeInit(pVCpu);
1971 AssertReturn(pReNative, pTb);
1972 }
1973
1974 /*
1975 * Emit prolog code (fixed).
1976 */
1977 uint32_t off = iemNativeEmitProlog(pReNative, 0);
1978 AssertReturn(off != UINT32_MAX, pTb);
1979
1980 /*
1981 * Convert the calls to native code.
1982 */
1983 PCIEMTHRDEDCALLENTRY pCallEntry = pTb->Thrd.paCalls;
1984 uint32_t cCallsLeft = pTb->Thrd.cCalls;
1985 while (cCallsLeft-- > 0)
1986 {
1987 PFNIEMNATIVERECOMPFUNC const pfnRecom = g_apfnIemNativeRecompileFunctions[pCallEntry->enmFunction];
1988 if (pfnRecom) /** @todo stats on this. */
1989 off = pfnRecom(pReNative, off, pCallEntry);
1990 else
1991 off = iemNativeEmitThreadedCall(pReNative, off, pCallEntry);
1992 AssertReturn(off != UINT32_MAX, pTb);
1993
1994 pCallEntry++;
1995 }
1996
1997 /*
1998 * Emit the epilog code.
1999 */
2000 off = iemNativeEmitEpilog(pReNative, off);
2001 AssertReturn(off != UINT32_MAX, pTb);
2002
2003 /*
2004 * Make sure all labels have been defined.
2005 */
2006 PIEMNATIVELABEL const paLabels = pReNative->paLabels;
2007#ifdef VBOX_STRICT
2008 uint32_t const cLabels = pReNative->cLabels;
2009 for (uint32_t i = 0; i < cLabels; i++)
2010 AssertMsgReturn(paLabels[i].off < off, ("i=%d enmType=%d\n", i, paLabels[i].enmType), pTb);
2011#endif
2012
2013 /*
2014 * Allocate executable memory and copy over the code we've generated.
2015 */
2016 PIEMTBALLOCATOR const pTbAllocator = pVCpu->iem.s.pTbAllocatorR3;
2017 if (pTbAllocator->pDelayedFreeHead)
2018 iemTbAllocatorProcessDelayedFrees(pVCpu, pVCpu->iem.s.pTbAllocatorR3);
2019
2020 PIEMNATIVEINSTR const paFinalInstrBuf = (PIEMNATIVEINSTR)iemExecMemAllocatorAlloc(pVCpu, off * sizeof(IEMNATIVEINSTR));
2021 AssertReturn(paFinalInstrBuf, pTb);
2022 memcpy(paFinalInstrBuf, pReNative->pInstrBuf, off * sizeof(paFinalInstrBuf[0]));
2023
2024 /*
2025 * Apply fixups.
2026 */
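/* Each fixup patches a label-relative displacement now that the final label
   offsets are known: a 32-bit PC-relative displacement on AMD64/X86, and on
   ARM64 the 19-bit immediate at bit 5 of a conditional branch instruction
   (hence kIemNativeFixupType_RelImm19At5 and the range assertion below). */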
2027 PIEMNATIVEFIXUP const paFixups = pReNative->paFixups;
2028 uint32_t const cFixups = pReNative->cFixups;
2029 for (uint32_t i = 0; i < cFixups; i++)
2030 {
2031 Assert(paFixups[i].off < off);
2032 Assert(paFixups[i].idxLabel < cLabels);
2033 RTPTRUNION const Ptr = { &paFinalInstrBuf[paFixups[i].off] };
2034 switch (paFixups[i].enmType)
2035 {
2036#if defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86)
2037 case kIemNativeFixupType_Rel32:
2038 Assert(paFixups[i].off + 4 <= off);
2039 *Ptr.pi32 = paLabels[paFixups[i].idxLabel].off - paFixups[i].off + paFixups[i].offAddend;
2040 continue;
2041
2042#elif defined(RT_ARCH_ARM64)
2043 case kIemNativeFixupType_RelImm19At5:
2044 {
2045 Assert(paFixups[i].off < off);
2046 int32_t const offDisp = paLabels[paFixups[i].idxLabel].off - paFixups[i].off + paFixups[i].offAddend;
2047 Assert(offDisp >= -262144 && offDisp < 262144);
2048 *Ptr.pu32 = (*Ptr.pu32 & UINT32_C(0xff00001f)) | (offDisp << 5);
2049 continue;
2050 }
2051#endif
2052 case kIemNativeFixupType_Invalid:
2053 case kIemNativeFixupType_End:
2054 break;
2055 }
2056 AssertFailed();
2057 }
2058
2059 iemExecMemAllocatorReadyForUse(pVCpu, paFinalInstrBuf, off * sizeof(IEMNATIVEINSTR));
2060
2061 /*
2062 * Convert the translation block.
2063 */
2064 //RT_BREAKPOINT();
2065 RTMemFree(pTb->Thrd.paCalls);
2066 pTb->Native.paInstructions = paFinalInstrBuf;
2067 pTb->Native.cInstructions = off;
2068 pTb->fFlags = (pTb->fFlags & ~IEMTB_F_TYPE_MASK) | IEMTB_F_TYPE_NATIVE;
2069
2070 Assert(pTbAllocator->cThreadedTbs > 0);
2071 pTbAllocator->cThreadedTbs -= 1;
2072 pTbAllocator->cNativeTbs += 1;
2073 Assert(pTbAllocator->cNativeTbs <= pTbAllocator->cTotalTbs);
2074
2075 return pTb;
2076}
2077