CPU/CodeCache: Always dynamically allocate code buffer

Reduces .bss size.
This commit is contained in:
Stenzek 2024-06-29 18:12:30 +10:00
parent 0d3e674500
commit be8fbafd71
No known key found for this signature in database
12 changed files with 453 additions and 398 deletions

View file

@ -11,24 +11,36 @@
#include "fmt/format.h"
#include <memory>
#if defined(_WIN32)
#include "windows_headers.h"
#include <Psapi.h>
#elif defined(__APPLE__)
#ifdef __aarch64__
#include <pthread.h> // pthread_jit_write_protect_np()
#endif
#include <mach-o/dyld.h>
#include <mach-o/getsect.h>
#include <mach/mach_init.h>
#include <mach/mach_port.h>
#include <mach/mach_vm.h>
#include <mach/vm_map.h>
#include <sys/mman.h>
#elif !defined(__ANDROID__)
#include <cerrno>
#include <dlfcn.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#endif
Log_SetChannel(MemoryArena);
Log_SetChannel(MemMap);
namespace MemMap {
/// Allocates RWX memory at the specified address.
/// A null addr requests an allocation wherever the OS chooses to place it.
/// Returns the mapped pointer, or nullptr if the allocation could not be made (this includes
/// platforms/configurations where a targeted mapping is not supported).
static void* AllocateJITMemoryAt(const void* addr, size_t size);
} // namespace MemMap
#ifdef _WIN32
@ -90,6 +102,44 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size)
Panic("Failed to unmap shared memory");
}
const void* MemMap::GetBaseAddress()
{
const HMODULE mod = GetModuleHandleW(nullptr);
if (!mod)
return nullptr;
MODULEINFO mi;
if (!GetModuleInformation(GetCurrentProcess(), mod, &mi, sizeof(mi)))
return mod;
return mi.lpBaseOfDll;
}
void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size)
{
void* ptr = static_cast<u8*>(VirtualAlloc(const_cast<void*>(addr), size,
addr ? (MEM_RESERVE | MEM_COMMIT) : MEM_COMMIT, PAGE_EXECUTE_READWRITE));
if (!ptr && !addr) [[unlikely]]
ERROR_LOG("VirtualAlloc(RWX, {}) for internal buffer failed: {}", size, GetLastError());
return ptr;
}
void MemMap::ReleaseJITMemory(void* ptr, size_t size)
{
if (!VirtualFree(ptr, 0, MEM_RELEASE))
ERROR_LOG("Failed to free code pointer {}", static_cast<void*>(ptr));
}
#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
/// Flushes the instruction cache of the current process for `size` bytes starting at `address`.
/// Only compiled for the architectures named in the guard above.
void MemMap::FlushInstructionCache(void* address, size_t size)
{
  const HANDLE current_process = GetCurrentProcess();
  ::FlushInstructionCache(current_process, address, size);
}
#endif
SharedMemoryMappingArea::SharedMemoryMappingArea() = default;
SharedMemoryMappingArea::~SharedMemoryMappingArea()
@ -346,6 +396,93 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size)
Panic("Failed to unmap shared memory");
}
/// Returns the in-memory address of the main executable's __TEXT segment: the segment's vmaddr plus the
/// dyld slide of the loaded image whose name matches the executable path.
/// If any lookup step fails, falls back to the address of this function, which at least lies inside
/// the module's code.
const void* MemMap::GetBaseAddress()
{
  // Calling with a null buffer and zero size is used to obtain the required buffer size.
  u32 name_buffer_size = 0;
  _NSGetExecutablePath(nullptr, &name_buffer_size);
  if (name_buffer_size > 0) [[likely]]
  {
    std::unique_ptr<char[]> name_buffer = std::make_unique_for_overwrite<char[]>(name_buffer_size + 1);
    if (_NSGetExecutablePath(name_buffer.get(), &name_buffer_size) == 0) [[likely]]
    {
      // Buffer was allocated with one extra byte, so terminating at name_buffer_size is always in bounds.
      name_buffer[name_buffer_size] = 0;

      const struct segment_command_64* command = getsegbyname("__TEXT");
      if (command) [[likely]]
      {
        const u8* base = reinterpret_cast<const u8*>(command->vmaddr);
        const u32 image_count = _dyld_image_count();
        for (u32 i = 0; i < image_count; i++)
        {
          // _dyld_get_image_name() yields NULL for an out-of-range index, which can happen if an image is
          // unloaded between the count query and this call. Passing NULL to strcmp() is undefined.
          const char* image_name = _dyld_get_image_name(i);
          if (image_name && std::strcmp(image_name, name_buffer.get()) == 0)
            return base + _dyld_get_image_vmaddr_slide(i);
        }
      }
    }
  }

  return reinterpret_cast<const void*>(&GetBaseAddress);
}
void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size)
{
#if !defined(__aarch64__)
kern_return_t ret = mach_vm_allocate(mach_task_self(), reinterpret_cast<mach_vm_address_t*>(&addr), size,
addr ? VM_FLAGS_FIXED : VM_FLAGS_ANYWHERE);
if (ret != KERN_SUCCESS)
{
ERROR_LOG("mach_vm_allocate() returned {}", ret);
return nullptr;
}
ret = mach_vm_protect(mach_task_self(), reinterpret_cast<mach_vm_address_t>(addr), size, false,
VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE);
if (ret != KERN_SUCCESS)
{
ERROR_LOG("mach_vm_protect() returned {}", ret);
mach_vm_deallocate(mach_task_self(), reinterpret_cast<mach_vm_address_t>(addr), size);
return nullptr;
}
return const_cast<void*>(addr);
#else
// On ARM64, we need to use MAP_JIT, which means we can't use MAP_FIXED.
if (addr)
return nullptr;
constexpr int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT;
void* ptr = mmap(const_cast<void*>(addr), size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0);
if (ptr == MAP_FAILED)
{
ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", size, errno);
return nullptr;
}
return ptr;
#endif
}
void MemMap::ReleaseJITMemory(void* ptr, size_t size)
{
#if !defined(__aarch64__)
const kern_return_t res = mach_vm_deallocate(mach_task_self(), reinterpret_cast<mach_vm_address_t>(ptr), size);
if (res != KERN_SUCCESS)
ERROR_LOG("mach_vm_deallocate() failed: {}", res);
#else
if (munmap(ptr, size) != 0)
ERROR_LOG("Failed to free code pointer {}", static_cast<void*>(ptr));
#endif
}
#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
/// Clears the instruction cache for the byte range [address, address + size) via the compiler builtin.
void MemMap::FlushInstructionCache(void* address, size_t size)
{
  char* const range_begin = reinterpret_cast<char*>(address);
  char* const range_end = range_begin + size;
  __builtin___clear_cache(range_begin, range_end);
}
#endif
SharedMemoryMappingArea::SharedMemoryMappingArea() = default;
SharedMemoryMappingArea::~SharedMemoryMappingArea()
@ -531,6 +668,72 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size)
Panic("Failed to unmap shared memory");
}
/// Returns the load base (dli_fbase) of the object that contains this function, as reported by dladdr().
/// Logs an error and returns nullptr if dladdr() fails.
const void* MemMap::GetBaseAddress()
{
#ifndef __APPLE__
  // Ask the dynamic linker which loaded object this very function belongs to.
  Dl_info module_info;
  const int found = dladdr(reinterpret_cast<const void*>(&GetBaseAddress), &module_info);
  if (found != 0)
    return module_info.dli_fbase;

  ERROR_LOG("dladdr() failed");
  return nullptr;
#else
#error Fixme
#endif
}
/// Attempts to mmap() `size` bytes of anonymous, private RWX memory.
/// A non-null `addr` requests a mapping at exactly that address without replacing an existing mapping
/// (where the platform offers such a flag); a null `addr` lets the kernel choose.
/// Returns nullptr on failure, or if a targeted request ended up somewhere other than `addr`.
void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size)
{
  const bool targeted = (addr != nullptr);
  int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;

#if defined(__linux__)
  // Linux does the right thing, allows us to not disturb an existing mapping.
  if (targeted)
    map_flags |= MAP_FIXED_NOREPLACE;
#elif defined(__FreeBSD__)
  // FreeBSD achieves the same with MAP_FIXED and MAP_EXCL.
  if (targeted)
    map_flags |= MAP_FIXED | MAP_EXCL;
#else
  // Targeted mapping not available?
  if (targeted)
    return nullptr;
#endif

  void* const mapping = mmap(const_cast<void*>(addr), size, PROT_READ | PROT_WRITE | PROT_EXEC, map_flags, -1, 0);
  if (mapping == MAP_FAILED)
  {
    // Targeted failures are expected while probing, so only the untargeted case is reported.
    if (!targeted)
      ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", size, errno);

    return nullptr;
  }

  // The mapping succeeded but was placed elsewhere; undo it and report failure.
  if (targeted && mapping != addr) [[unlikely]]
  {
    if (munmap(mapping, size) != 0)
      ERROR_LOG("Failed to munmap() incorrectly hinted allocation: {}", errno);

    return nullptr;
  }

  return mapping;
}
/// Unmaps `size` bytes at `ptr` with munmap(), logging an error if the call fails.
void MemMap::ReleaseJITMemory(void* ptr, size_t size)
{
  const bool unmapped = (munmap(ptr, size) == 0);
  if (unmapped)
    return;

  ERROR_LOG("Failed to free code pointer {}", static_cast<void*>(ptr));
}
#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
/// Clears the instruction cache for the byte range [address, address + size) via the compiler builtin.
void MemMap::FlushInstructionCache(void* address, size_t size)
{
  char* const range_begin = reinterpret_cast<char*>(address);
  char* const range_end = range_begin + size;
  __builtin___clear_cache(range_begin, range_end);
}
#endif
SharedMemoryMappingArea::SharedMemoryMappingArea() = default;
SharedMemoryMappingArea::~SharedMemoryMappingArea()
@ -591,3 +794,95 @@ bool SharedMemoryMappingArea::Unmap(void* map_base, size_t map_size)
}
#endif
/// Allocates `size` bytes of RWX memory, preferring an address within a per-architecture maximum
/// displacement of the (page-aligned-down) process base address.
/// Search order: forwards from base + assumed binary size, then backwards from base, then anywhere.
/// On Apple Silicon the placement cannot be chosen, so the allocation is made wherever the OS puts it.
/// Returns nullptr if every attempt fails.
void* MemMap::AllocateJITMemory(size_t size)
{
const u8* base =
reinterpret_cast<const u8*>(Common::AlignDownPow2(reinterpret_cast<uintptr_t>(GetBaseAddress()), HOST_PAGE_SIZE));
u8* ptr = nullptr;
#if !defined(CPU_ARCH_ARM64) || !defined(__APPLE__)
// Per-architecture search parameters:
//   assume_binary_size - offset from base at which the forward search starts.
//   step               - distance between successive candidate addresses.
//   max_displacement   - furthest distance from base that is considered in range.
#if defined(CPU_ARCH_X64)
static constexpr size_t assume_binary_size = 64 * 1024 * 1024;
static constexpr size_t step = 64 * 1024 * 1024;
static constexpr size_t max_displacement = 0x80000000u;
#elif defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
static constexpr size_t assume_binary_size = 16 * 1024 * 1024;
static constexpr size_t step = 8 * 1024 * 1024;
static constexpr size_t max_displacement =
1024 * 1024 * 1024; // technically 4GB, but we don't want to spend that much time trying
#elif defined(CPU_ARCH_ARM32)
static constexpr size_t assume_binary_size = 8 * 1024 * 1024; // Wishful thinking...
static constexpr size_t step = 2 * 1024 * 1024;
static constexpr size_t max_displacement = 32 * 1024 * 1024;
#else
#error Unhandled architecture.
#endif
// Largest start offset from base for which the end of the buffer is still within max_displacement.
// NOTE(review): the Assert below runs after this subtraction; if size > max_displacement the unsigned
// subtraction has already wrapped by the time the assertion is evaluated. Consider asserting first.
const size_t max_displacement_from_start = max_displacement - size;
Assert(size <= max_displacement);
// Try to find a region in the max displacement range of the process base address.
// Assume that the DuckStation binary will at max be some size, release is currently around 12MB on Windows.
// Therefore the max offset is +/- 12MB + code_size. Try allocating in steps by incrementing the pointer, then if no
// address range is found, go backwards from the base address (which will probably fail).
// min_address is clamped so that it never goes below address zero.
const u8* min_address =
base - std::min(reinterpret_cast<ptrdiff_t>(base), static_cast<ptrdiff_t>(max_displacement_from_start));
const u8* max_address = base + max_displacement_from_start;
VERBOSE_LOG("Base address: {}", static_cast<const void*>(base));
VERBOSE_LOG("Acceptable address range: {} - {}", static_cast<const void*>(min_address),
static_cast<const void*>(max_address));
// Start offset by the expected binary size.
for (const u8* current_address = base + assume_binary_size;; current_address += step)
{
VERBOSE_LOG("Trying {} (displacement 0x{:X})", static_cast<const void*>(current_address),
static_cast<ptrdiff_t>(current_address - base));
if ((ptr = static_cast<u8*>(AllocateJITMemoryAt(current_address, size))))
break;
// Stop once the next candidate would pass max_address, or the address computation would wrap around.
if ((reinterpret_cast<uintptr_t>(current_address) + step) > reinterpret_cast<uintptr_t>(max_address) ||
(reinterpret_cast<uintptr_t>(current_address) + step) < reinterpret_cast<uintptr_t>(current_address))
{
break;
}
}
// Try before (will likely fail).
// Skipped entirely when base is too low for even a single step backwards.
if (!ptr && reinterpret_cast<uintptr_t>(base) >= step)
{
for (const u8* current_address = base - step;; current_address -= step)
{
VERBOSE_LOG("Trying {} (displacement 0x{:X})", static_cast<const void*>(current_address),
static_cast<ptrdiff_t>(base - current_address));
if ((ptr = static_cast<u8*>(AllocateJITMemoryAt(current_address, size))))
break;
// Stop once the next candidate would fall below min_address, or the subtraction would wrap around.
if ((reinterpret_cast<uintptr_t>(current_address) - step) < reinterpret_cast<uintptr_t>(min_address) ||
(reinterpret_cast<uintptr_t>(current_address) - step) > reinterpret_cast<uintptr_t>(current_address))
{
break;
}
}
}
// Nothing in range was available: fall back to an allocation at any address.
if (!ptr)
{
#ifdef CPU_ARCH_X64
ERROR_LOG("Failed to allocate JIT buffer in range, expect crashes.");
#endif
if (!(ptr = static_cast<u8*>(AllocateJITMemoryAt(nullptr, size))))
return ptr;
}
#else
// We cannot control where the buffer gets allocated on Apple Silicon. Hope for the best.
if (!(ptr = static_cast<u8*>(AllocateJITMemoryAt(nullptr, size))))
return ptr;
#endif
// Report where the buffer landed relative to base; the MB figure is rounded up.
INFO_LOG("Allocated JIT buffer of size {} at {} (0x{:X} bytes / {} MB away)", size, static_cast<void*>(ptr),
std::abs(static_cast<ptrdiff_t>(ptr - base)),
(std::abs(static_cast<ptrdiff_t>(ptr - base)) + (1024 * 1024 - 1)) / (1024 * 1024));
return ptr;
}

View file

@ -58,6 +58,25 @@ void* MapSharedMemory(void* handle, size_t offset, void* baseaddr, size_t size,
void UnmapSharedMemory(void* baseaddr, size_t size);
bool MemProtect(void* baseaddr, size_t size, PageProtect mode);
/// Returns the base address for the current process.
const void* GetBaseAddress();
/// Allocates RWX memory in branch range from the base address.
void* AllocateJITMemory(size_t size);
/// Releases RWX memory.
void ReleaseJITMemory(void* ptr, size_t size);
/// Flushes the instruction cache on the host for the specified range.
/// Only needed outside of X86, X86 has coherent D/I cache.
#if !defined(CPU_ARCH_ARM32) && !defined(CPU_ARCH_ARM64) && !defined(CPU_ARCH_RISCV64)
// clang-format off
ALWAYS_INLINE static void FlushInstructionCache(void* address, size_t size) { }
// clang-format on
#else
void FlushInstructionCache(void* address, size_t size);
#endif
/// JIT write protect for Apple Silicon. Needs to be called prior to writing to any RWX pages.
#if !defined(__APPLE__) || !defined(__aarch64__)
// clang-format off

View file

@ -123,22 +123,27 @@ PerfScope MIPSPerfScope("MIPS");
#endif
// Currently remapping the code buffer doesn't work in macOS. TODO: Make dynamic instead...
#ifndef __APPLE__
#define USE_STATIC_CODE_BUFFER 1
#endif
#if defined(CPU_ARCH_ARM32)
// Use a smaller code buffer size on AArch32 to have a better chance of being in range.
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 16 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 8 * 1024 * 1024;
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 20 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 4 * 1024 * 1024;
#else
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 32 * 1024 * 1024;
static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 48 * 1024 * 1024;
static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 16 * 1024 * 1024;
#endif
#ifdef USE_STATIC_CODE_BUFFER
alignas(HOST_PAGE_SIZE) static u8 s_code_storage[RECOMPILER_CODE_CACHE_SIZE + RECOMPILER_FAR_CODE_CACHE_SIZE];
// On Linux ARM32/ARM64, we use a dedicated section in the ELF for storing code.
// This is because without ASLR, or on certain ASLR offsets, the sbrk() heap ends up immediately following the text/data
// sections, which means there isn't a large enough gap to fit within range on ARM32.
#if defined(__linux__) && (defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64))
#define USE_CODE_BUFFER_SECTION 1
#ifdef __clang__
#pragma clang section bss = ".jitstorage"
__attribute__((aligned(HOST_PAGE_SIZE))) static u8 s_code_buffer_ptr[RECOMPILER_CODE_CACHE_SIZE];
#pragma clang section bss = ""
#endif
#else
static u8* s_code_buffer_ptr = nullptr;
#endif
static JitCodeBuffer s_code_buffer;
@ -162,20 +167,26 @@ bool CPU::CodeCache::IsUsingFastmem()
bool CPU::CodeCache::ProcessStartup(Error* error)
{
AllocateLUTs();
#ifdef USE_STATIC_CODE_BUFFER
const bool has_buffer =
s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, HOST_PAGE_SIZE);
#ifdef USE_CODE_BUFFER_SECTION
const u8* module_base = static_cast<const u8*>(MemMap::GetBaseAddress());
INFO_LOG("Using JIT buffer section of size {} at {} (0x{:X} bytes / {} MB away)", sizeof(s_code_buffer_ptr),
static_cast<void*>(s_code_buffer_ptr), std::abs(static_cast<ptrdiff_t>(s_code_buffer_ptr - module_base)),
(std::abs(static_cast<ptrdiff_t>(s_code_buffer_ptr - module_base)) + (1024 * 1024 - 1)) / (1024 * 1024));
const bool code_buffer_allocated =
MemMap::MemProtect(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, PageProtect::ReadWriteExecute);
#else
const bool has_buffer = false;
s_code_buffer_ptr = static_cast<u8*>(MemMap::AllocateJITMemory(RECOMPILER_CODE_CACHE_SIZE));
const bool code_buffer_allocated = (s_code_buffer_ptr != nullptr);
#endif
if (!has_buffer && !s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE))
if (!code_buffer_allocated) [[unlikely]]
{
Error::SetStringView(error, "Failed to initialize code space");
Error::SetStringView(error, "Failed to allocate code storage. The log may contain more information, you will need "
"to run DuckStation with -earlyconsole in the command line.");
return false;
}
AllocateLUTs();
if (!PageFaultHandler::Install(error))
return false;
@ -184,17 +195,21 @@ bool CPU::CodeCache::ProcessStartup(Error* error)
void CPU::CodeCache::ProcessShutdown()
{
s_code_buffer.Destroy();
DeallocateLUTs();
#ifndef USE_CODE_BUFFER_SECTION
MemMap::ReleaseJITMemory(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE);
#endif
}
void CPU::CodeCache::Initialize()
{
Assert(s_blocks.empty());
// TODO: Reduce far code size when not using memory exceptions.
if (IsUsingAnyRecompiler())
{
s_code_buffer.Reset();
s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE);
CompileASMFunctions();
ResetCodeLUT();
}
@ -219,7 +234,7 @@ void CPU::CodeCache::Reset()
if (IsUsingAnyRecompiler())
{
ClearASMFunctions();
s_code_buffer.Reset();
s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE);
CompileASMFunctions();
ResetCodeLUT();
}

View file

@ -1,9 +1,10 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
@ -171,7 +172,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
}
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kA32InstructionSizeInBytes);
MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);
return kA32InstructionSizeInBytes;
}
@ -202,7 +203,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
s_trampoline_targets.emplace(target, offset);
s_trampoline_used = offset + static_cast<u32>(size);
JitCodeBuffer::FlushInstructionCache(start, size);
MemMap::FlushInstructionCache(start, size);
return start;
}
@ -1790,7 +1791,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View file

@ -1,9 +1,11 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
@ -274,7 +276,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
s_trampoline_targets.emplace(target, offset);
s_trampoline_used = offset + static_cast<u32>(size);
JitCodeBuffer::FlushInstructionCache(start, size);
MemMap::FlushInstructionCache(start, size);
return start;
}
@ -316,7 +318,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
std::memcpy(code, &new_code, sizeof(new_code));
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kInstructionSize);
MemMap::FlushInstructionCache(code, kInstructionSize);
return kInstructionSize;
}
@ -2100,7 +2102,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View file

@ -12,6 +12,7 @@
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#ifdef CPU_ARCH_X64
@ -1768,15 +1769,8 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size)
void CodeGenerator::EmitCall(const void* ptr)
{
if (Xbyak::inner::IsInInt32(reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(m_emit->getCurr())))
{
m_emit->call(ptr);
}
else
{
m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast<size_t>(ptr));
m_emit->call(GetHostReg64(RRETURN));
}
DebugAssert(Xbyak::inner::IsInInt32(reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(m_emit->getCurr())));
m_emit->call(ptr);
}
void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr)
@ -2530,7 +2524,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore
for (s32 i = 0; i < nops; i++)
cg.nop();
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
MemMap::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)

View file

@ -318,6 +318,34 @@ void System::CheckCacheLineSize()
}
}
bool System::Internal::ProcessStartup(Error* error)
{
Common::Timer timer;
// Allocate JIT memory as soon as possible.
if (!CPU::CodeCache::ProcessStartup(error))
return false;
// Fastmem alloc *must* come after JIT alloc, otherwise it tends to eat the 4GB region after the executable on MacOS.
if (!Bus::AllocateMemory(error))
{
CPU::CodeCache::ProcessShutdown();
return false;
}
VERBOSE_LOG("Memory allocation took {} ms.", timer.GetTimeMilliseconds());
CheckCacheLineSize();
return true;
}
void System::Internal::ProcessShutdown()
{
Bus::ReleaseMemory();
CPU::CodeCache::ProcessShutdown();
}
bool System::Internal::CPUThreadInitialize(Error* error)
{
#ifdef _WIN32
@ -332,17 +360,9 @@ bool System::Internal::CPUThreadInitialize(Error* error)
}
#endif
if (!CPU::CodeCache::ProcessStartup(error) || !Bus::AllocateMemory(error))
{
CPUThreadShutdown();
return false;
}
// This will call back to Host::LoadSettings() -> ReloadSources().
LoadSettings(false);
CheckCacheLineSize();
#ifdef ENABLE_RAINTEGRATION
if (Host::GetBaseBoolSettingValue("Cheevos", "UseRAIntegration", false))
Achievements::SwitchToRAIntegration();
@ -377,9 +397,6 @@ void System::Internal::CPUThreadShutdown()
InputManager::CloseSources();
CPU::CodeCache::ProcessShutdown();
Bus::ReleaseMemory();
#ifdef _WIN32
CoUninitialize();
#endif

View file

@ -504,10 +504,16 @@ namespace Internal {
/// Performs mandatory hardware checks.
bool PerformEarlyHardwareChecks(Error* error);
/// Called on process startup.
bool CPUThreadInitialize(Error* error);
/// Called on process startup, as early as possible.
bool ProcessStartup(Error* error);
/// Called on process shutdown.
void ProcessShutdown();
/// Called on CPU thread initialization.
bool CPUThreadInitialize(Error* error);
/// Called on CPU thread shutdown.
void CPUThreadShutdown();
/// Polls input, updates subsystems which are present while paused/inactive.

View file

@ -90,6 +90,7 @@ static constexpr u32 GDB_SERVER_POLLING_INTERVAL = 1;
//////////////////////////////////////////////////////////////////////////
namespace QtHost {
static bool PerformEarlyHardwareChecks();
static bool EarlyProcessStartup();
static void RegisterTypes();
static bool InitializeConfig(std::string settings_filename);
static bool ShouldUsePortableMode();
@ -128,11 +129,26 @@ EmuThread::EmuThread(QThread* ui_thread) : QThread(), m_ui_thread(ui_thread)
EmuThread::~EmuThread() = default;
void QtHost::RegisterTypes()
{
// Register any standard types we need elsewhere
qRegisterMetaType<std::optional<WindowInfo>>("std::optional<WindowInfo>()");
qRegisterMetaType<std::optional<bool>>();
qRegisterMetaType<std::function<void()>>("std::function<void()>");
qRegisterMetaType<std::shared_ptr<SystemBootParameters>>();
qRegisterMetaType<const GameList::Entry*>();
qRegisterMetaType<GPURenderer>("GPURenderer");
qRegisterMetaType<InputBindingKey>("InputBindingKey");
qRegisterMetaType<std::string>("std::string");
qRegisterMetaType<std::vector<std::pair<std::string, std::string>>>(
"std::vector<std::pair<std::string, std::string>>");
}
bool QtHost::PerformEarlyHardwareChecks()
{
Error error;
const bool okay = System::Internal::PerformEarlyHardwareChecks(&error);
if (okay && !error.IsValid())
if (okay && !error.IsValid()) [[likely]]
return true;
if (okay)
@ -149,19 +165,15 @@ bool QtHost::PerformEarlyHardwareChecks()
return okay;
}
void QtHost::RegisterTypes()
bool QtHost::EarlyProcessStartup()
{
// Register any standard types we need elsewhere
qRegisterMetaType<std::optional<WindowInfo>>("std::optional<WindowInfo>()");
qRegisterMetaType<std::optional<bool>>();
qRegisterMetaType<std::function<void()>>("std::function<void()>");
qRegisterMetaType<std::shared_ptr<SystemBootParameters>>();
qRegisterMetaType<const GameList::Entry*>();
qRegisterMetaType<GPURenderer>("GPURenderer");
qRegisterMetaType<InputBindingKey>("InputBindingKey");
qRegisterMetaType<std::string>("std::string");
qRegisterMetaType<std::vector<std::pair<std::string, std::string>>>(
"std::vector<std::pair<std::string, std::string>>");
Error error;
if (System::Internal::ProcessStartup(&error)) [[likely]]
return true;
QMessageBox::critical(nullptr, QStringLiteral("Process Startup Failed"),
QString::fromStdString(error.GetDescription()));
return false;
}
bool QtHost::InBatchMode()
@ -452,7 +464,7 @@ bool QtHost::InitializeConfig(std::string settings_filename)
EmuFolders::EnsureFoldersExist();
MigrateSettings();
// We need to create the console window early, otherwise it appears behind the main window.
// We need to create the console window early, otherwise it appears in front of the main window.
if (!Log::IsConsoleOutputEnabled() &&
s_base_settings_interface->GetBoolValue("Logging", "LogToConsole", Settings::DEFAULT_LOG_TO_CONSOLE))
{
@ -2508,6 +2520,9 @@ int main(int argc, char* argv[])
if (!QtHost::ParseCommandLineParametersAndInitializeConfig(app, autoboot))
return EXIT_FAILURE;
if (!QtHost::EarlyProcessStartup())
return EXIT_FAILURE;
// Remove any previous-version remnants.
if (s_cleanup_after_update)
AutoUpdaterDialog::cleanupAfterUpdate();
@ -2581,5 +2596,7 @@ shutdown_and_exit:
// Ensure log is flushed.
Log::SetFileOutputParams(false, nullptr);
System::Internal::ProcessShutdown();
return result;
}

View file

@ -1,301 +1,40 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "jit_code_buffer.h"
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include <algorithm>
Log_SetChannel(JitCodeBuffer);
#if defined(_WIN32)
#include "common/windows_headers.h"
#else
#include <errno.h>
#include <sys/mman.h>
#ifdef __APPLE__
#include <mach/mach_init.h>
#include <mach/mach_vm.h>
#endif
#endif
#include <cstring>
JitCodeBuffer::JitCodeBuffer() = default;
JitCodeBuffer::JitCodeBuffer(u32 size, u32 far_code_size)
JitCodeBuffer::~JitCodeBuffer() = default;
void JitCodeBuffer::Reset(void* ptr, u32 size, u32 far_code_size /* = 0 */)
{
if (!Allocate(size, far_code_size))
Panic("Failed to allocate code space");
}
JitCodeBuffer::JitCodeBuffer(void* buffer, u32 size, u32 far_code_size, u32 guard_pages)
{
if (!Initialize(buffer, size, far_code_size))
Panic("Failed to initialize code space");
}
JitCodeBuffer::~JitCodeBuffer()
{
Destroy();
}
bool JitCodeBuffer::Allocate(u32 size /* = 64 * 1024 * 1024 */, u32 far_code_size /* = 0 */)
{
Destroy();
m_total_size = size + far_code_size;
#ifdef CPU_ARCH_X64
// Try to find a region in 32-bit range of ourselves.
// Assume that the DuckStation binary will at max be 256MB. Therefore the max offset is
// +/- 256MB + round_up_pow2(size). This'll be 512MB for the JITs.
static const u8 base_ptr = 0;
const u8* base =
reinterpret_cast<const u8*>(Common::AlignDownPow2(reinterpret_cast<uintptr_t>(&base_ptr), HOST_PAGE_SIZE));
const u32 max_displacement = 0x80000000u - Common::NextPow2(256 * 1024 * 1024 + m_total_size);
const u8* max_address = ((base + max_displacement) < base) ?
reinterpret_cast<const u8*>(std::numeric_limits<uintptr_t>::max()) :
(base + max_displacement);
const u8* min_address = ((base - max_displacement) > base) ? nullptr : (base - max_displacement);
const u32 step = 64 * 1024 * 1024;
const u32 steps = static_cast<u32>(max_address - min_address) / step;
for (u32 offset = 0; offset < steps; offset++)
{
const u8* addr = max_address - (offset * step);
VERBOSE_LOG("Trying {} (base {}, offset {}, displacement 0x{:X})", static_cast<const void*>(addr),
static_cast<const void*>(base), offset, static_cast<ptrdiff_t>(addr - base));
if (TryAllocateAt(addr))
break;
}
if (m_code_ptr)
{
INFO_LOG("Allocated JIT buffer of size {} at {} (0x{:X} bytes away)", m_total_size, static_cast<void*>(m_code_ptr),
static_cast<ptrdiff_t>(m_code_ptr - base));
}
else
{
ERROR_LOG("Failed to allocate JIT buffer in range, expect crashes.");
if (!TryAllocateAt(nullptr))
return false;
}
#else
if (!TryAllocateAt(nullptr))
return false;
#endif
m_free_code_ptr = m_code_ptr;
m_code_size = size;
m_code_used = 0;
m_far_code_ptr = static_cast<u8*>(m_code_ptr) + size;
m_free_far_code_ptr = m_far_code_ptr;
m_far_code_size = far_code_size;
m_far_code_used = 0;
m_old_protection = 0;
m_owns_buffer = true;
return true;
}
bool JitCodeBuffer::TryAllocateAt(const void* addr)
{
#if defined(_WIN32)
m_code_ptr = static_cast<u8*>(VirtualAlloc(const_cast<void*>(addr), m_total_size,
addr ? (MEM_RESERVE | MEM_COMMIT) : MEM_COMMIT, PAGE_EXECUTE_READWRITE));
if (!m_code_ptr)
{
if (!addr)
ERROR_LOG("VirtualAlloc(RWX, {}) for internal buffer failed: {}", m_total_size, GetLastError());
return false;
}
return true;
#elif defined(__APPLE__) && !defined(__aarch64__)
kern_return_t ret = mach_vm_allocate(mach_task_self(), reinterpret_cast<mach_vm_address_t*>(&addr), m_total_size,
addr ? VM_FLAGS_FIXED : VM_FLAGS_ANYWHERE);
if (ret != KERN_SUCCESS)
{
ERROR_LOG("mach_vm_allocate() returned {}", ret);
return false;
}
ret = mach_vm_protect(mach_task_self(), reinterpret_cast<mach_vm_address_t>(addr), m_total_size, false,
VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE);
if (ret != KERN_SUCCESS)
{
ERROR_LOG("mach_vm_protect() returned {}", ret);
mach_vm_deallocate(mach_task_self(), reinterpret_cast<mach_vm_address_t>(addr), m_total_size);
return false;
}
m_code_ptr = static_cast<u8*>(const_cast<void*>(addr));
return true;
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__)
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#if defined(__linux__)
// Linux does the right thing, allows us to not disturb an existing mapping.
if (addr)
flags |= MAP_FIXED_NOREPLACE;
#elif defined(__FreeBSD__)
// FreeBSD achieves the same with MAP_FIXED and MAP_EXCL.
if (addr)
flags |= MAP_FIXED | MAP_EXCL;
#elif defined(__APPLE__)
// On ARM64, we need to use MAP_JIT, which means we can't use MAP_FIXED.
if (addr)
return false;
flags |= MAP_JIT;
#endif
m_code_ptr =
static_cast<u8*>(mmap(const_cast<void*>(addr), m_total_size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0));
if (!m_code_ptr)
{
if (!addr)
ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", m_total_size, errno);
return false;
}
else if (addr && m_code_ptr != addr)
{
if (munmap(m_code_ptr, m_total_size) != 0)
ERROR_LOG("Failed to munmap() incorrectly hinted allocation: {}", errno);
m_code_ptr = nullptr;
return false;
}
return true;
#else
return false;
#endif
}
bool JitCodeBuffer::Initialize(void* buffer, u32 size, u32 far_code_size /* = 0 */, u32 guard_size /* = 0 */)
{
Destroy();
if ((far_code_size > 0 && guard_size >= far_code_size) || (far_code_size + (guard_size * 2)) > size)
return false;
#if defined(_WIN32)
DWORD old_protect = 0;
if (!VirtualProtect(buffer, size, PAGE_EXECUTE_READWRITE, &old_protect))
{
ERROR_LOG("VirtualProtect(RWX) for external buffer failed: {}", GetLastError());
return false;
}
if (guard_size > 0)
{
DWORD old_guard_protect = 0;
u8* guard_at_end = (static_cast<u8*>(buffer) + size) - guard_size;
if (!VirtualProtect(buffer, guard_size, PAGE_NOACCESS, &old_guard_protect) ||
!VirtualProtect(guard_at_end, guard_size, PAGE_NOACCESS, &old_guard_protect))
{
ERROR_LOG("VirtualProtect(NOACCESS) for guard page failed: {}", GetLastError());
return false;
}
}
m_code_ptr = static_cast<u8*>(buffer);
m_old_protection = static_cast<u32>(old_protect);
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__)
if (mprotect(buffer, size, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
{
ERROR_LOG("mprotect(RWX) for external buffer failed: {}", errno);
return false;
}
if (guard_size > 0)
{
u8* guard_at_end = (static_cast<u8*>(buffer) + size) - guard_size;
if (mprotect(buffer, guard_size, PROT_NONE) != 0 || mprotect(guard_at_end, guard_size, PROT_NONE) != 0)
{
ERROR_LOG("mprotect(NONE) for guard page failed: {}", errno);
return false;
}
}
// reasonable default?
m_code_ptr = static_cast<u8*>(buffer);
m_old_protection = PROT_READ | PROT_WRITE;
#else
m_code_ptr = nullptr;
#endif
if (!m_code_ptr)
return false;
Assert(far_code_size < size);
m_total_size = size;
m_free_code_ptr = m_code_ptr + guard_size;
m_code_size = size - far_code_size - (guard_size * 2);
m_code_ptr = static_cast<u8*>(ptr);
m_free_code_ptr = m_code_ptr;
m_code_size = size - far_code_size;
m_code_used = 0;
m_far_code_ptr = static_cast<u8*>(m_code_ptr) + m_code_size;
m_far_code_size = far_code_size;
m_far_code_ptr = (far_code_size > 0) ? (static_cast<u8*>(m_code_ptr) + m_code_size) : nullptr;
m_free_far_code_ptr = m_far_code_ptr;
m_far_code_size = far_code_size - guard_size;
m_far_code_used = 0;
m_guard_size = guard_size;
m_owns_buffer = false;
return true;
}
MemMap::BeginCodeWrite();
/// Tears down the code buffer and returns every bookkeeping member to its empty state.
///
/// When m_owns_buffer is set, the mapping at m_code_ptr is released with the platform's
/// deallocation call. Otherwise, if a buffer is attached, its page protection is put back to the
/// value stored in m_old_protection. Failures are logged and teardown continues regardless.
void JitCodeBuffer::Destroy()
{
  if (m_owns_buffer)
  {
#if defined(_WIN32)
    // MEM_RELEASE requires a size of zero; the whole reservation is freed.
    if (!VirtualFree(m_code_ptr, 0, MEM_RELEASE))
      ERROR_LOG("Failed to free code pointer {}", static_cast<void*>(m_code_ptr));
#elif defined(__APPLE__) && !defined(__aarch64__)
    const kern_return_t res =
      mach_vm_deallocate(mach_task_self(), reinterpret_cast<mach_vm_address_t>(m_code_ptr), m_total_size);
    if (res != KERN_SUCCESS)
      ERROR_LOG("mach_vm_deallocate() failed: {}", res);
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__)
    if (munmap(m_code_ptr, m_total_size) != 0)
      ERROR_LOG("Failed to free code pointer {}", static_cast<void*>(m_code_ptr));
#endif
  }
  else if (m_code_ptr)
  {
    // Buffer is not owned by this object: only undo the protection change, do not free it.
#if defined(_WIN32)
    DWORD old_protect = 0;
    if (!VirtualProtect(m_code_ptr, m_total_size, m_old_protection, &old_protect))
      ERROR_LOG("Failed to restore protection on {}", static_cast<void*>(m_code_ptr));
#else
    if (mprotect(m_code_ptr, m_total_size, m_old_protection) != 0)
      ERROR_LOG("Failed to restore protection on {}", static_cast<void*>(m_code_ptr));
#endif
  }

  // The buffer must not be touched past this point: an owned mapping has just been released and an
  // external one may no longer be writable, so no clearing or cache flushing is done on it here.
  m_code_ptr = nullptr;
  m_free_code_ptr = nullptr;
  m_code_size = 0;
  m_code_reserve_size = 0;
  m_code_used = 0;

  m_far_code_ptr = nullptr;
  m_free_far_code_ptr = nullptr;
  m_far_code_size = 0;
  m_far_code_used = 0;

  m_total_size = 0;
  m_guard_size = 0;
  m_old_protection = 0;
  m_owns_buffer = false;
}
/// Sets aside `size` bytes at the current free-code position so they are excluded from the
/// usable code region. Only valid while no code has been committed, and `size` must be smaller
/// than the remaining code region (both conditions are asserted).
void JitCodeBuffer::ReserveCode(u32 size)
{
  Assert(m_code_used == 0);
  Assert(size < m_code_size);

  // Shrink the usable region from the front and remember the running total of reserved bytes;
  // Reset() adds m_code_reserve_size back in when it recomputes the free pointer.
  m_code_size -= size;
  m_free_code_ptr += size;
  m_code_reserve_size += size;

  // NOTE(review): no matching MemMap::BeginCodeWrite() is visible in this function — confirm pairing.
  MemMap::EndCodeWrite();
}
void JitCodeBuffer::CommitCode(u32 length)
@ -303,10 +42,7 @@ void JitCodeBuffer::CommitCode(u32 length)
if (length == 0)
return;
#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
// ARM instruction and data caches are not coherent, we need to flush after every block.
FlushInstructionCache(m_free_code_ptr, length);
#endif
MemMap::FlushInstructionCache(m_free_code_ptr, length);
Assert(length <= (m_code_size - m_code_used));
m_free_code_ptr += length;
@ -318,36 +54,13 @@ void JitCodeBuffer::CommitFarCode(u32 length)
if (length == 0)
return;
#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64)
// ARM instruction and data caches are not coherent, we need to flush after every block.
FlushInstructionCache(m_free_far_code_ptr, length);
#endif
MemMap::FlushInstructionCache(m_free_far_code_ptr, length);
Assert(length <= (m_far_code_size - m_far_code_used));
m_free_far_code_ptr += length;
m_far_code_used += length;
}
void JitCodeBuffer::Reset()
{
MemMap::BeginCodeWrite();
m_free_code_ptr = m_code_ptr + m_guard_size + m_code_reserve_size;
m_code_used = 0;
std::memset(m_free_code_ptr, 0, m_code_size);
FlushInstructionCache(m_free_code_ptr, m_code_size);
if (m_far_code_size > 0)
{
m_free_far_code_ptr = m_far_code_ptr;
m_far_code_used = 0;
std::memset(m_free_far_code_ptr, 0, m_far_code_size);
FlushInstructionCache(m_free_far_code_ptr, m_far_code_size);
}
MemMap::EndCodeWrite();
}
void JitCodeBuffer::Align(u32 alignment, u8 padding_value)
{
DebugAssert(Common::IsPow2(alignment));
@ -359,14 +72,3 @@ void JitCodeBuffer::Align(u32 alignment, u8 padding_value)
m_free_code_ptr += num_padding_bytes;
m_code_used += num_padding_bytes;
}
/// Flushes the host instruction cache for the `size` bytes starting at `address`.
/// Uses the Win32 API on Windows and the compiler builtin on GCC/Clang; any other toolchain is a
/// compile-time error.
void JitCodeBuffer::FlushInstructionCache(void* address, u32 size)
{
#if defined(_WIN32)
  // Global-scope qualifier selects the Win32 function rather than this member.
  ::FlushInstructionCache(GetCurrentProcess(), address, size);
#elif defined(__GNUC__) || defined(__clang__)
  char* const range_begin = static_cast<char*>(address);
  char* const range_end = range_begin + size;
  __builtin___clear_cache(range_begin, range_end);
#else
#error Unknown platform.
#endif
}

View file

@ -8,17 +8,12 @@ class JitCodeBuffer
{
public:
JitCodeBuffer();
JitCodeBuffer(u32 size, u32 far_code_size);
JitCodeBuffer(void* buffer, u32 size, u32 far_code_size, u32 guard_size);
~JitCodeBuffer();
bool IsValid() const { return (m_code_ptr != nullptr); }
bool Allocate(u32 size = 64 * 1024 * 1024, u32 far_code_size = 0);
bool Initialize(void* buffer, u32 size, u32 far_code_size = 0, u32 guard_size = 0);
void Destroy();
void Reset();
void Reset(void* ptr, u32 size, u32 far_code_size = 0);
ALWAYS_INLINE u8* GetCodePointer() const { return m_code_ptr; }
ALWAYS_INLINE u32 GetTotalSize() const { return m_total_size; }
ALWAYS_INLINE float GetUsedPct() const
@ -33,7 +28,6 @@ public:
ALWAYS_INLINE u8* GetFreeCodePointer() const { return m_free_code_ptr; }
ALWAYS_INLINE u32 GetFreeCodeSpace() const { return static_cast<u32>(m_code_size - m_code_used); }
void ReserveCode(u32 size);
void CommitCode(u32 length);
ALWAYS_INLINE u8* GetFreeFarCodePointer() const { return m_free_far_code_ptr; }
@ -44,12 +38,7 @@ public:
/// Assumes alignment is a power-of-two.
void Align(u32 alignment, u8 padding_value);
/// Flushes the instruction cache on the host for the specified range.
static void FlushInstructionCache(void* address, u32 size);
private:
bool TryAllocateAt(const void* addr);
u8* m_code_ptr = nullptr;
u8* m_free_code_ptr = nullptr;
u32 m_code_size = 0;
@ -62,7 +51,4 @@ private:
u32 m_far_code_used = 0;
u32 m_total_size = 0;
u32 m_guard_size = 0;
u32 m_old_protection = 0;
bool m_owns_buffer = false;
};

View file

@ -14,6 +14,7 @@
#include <memory>
#include "common/windows_headers.h"
#include <Psapi.h>
#include <WinSock2.h>
#include <mmsystem.h>