From be8fbafd71128eea97af91b8467556baddb7559a Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 29 Jun 2024 18:12:30 +1000 Subject: [PATCH] CPU/CodeCache: Always dynamically allocate code buffer Reduces .bss size. --- src/common/memmap.cpp | 297 +++++++++++++++- src/common/memmap.h | 19 + src/core/cpu_code_cache.cpp | 57 +-- .../cpu_recompiler_code_generator_aarch32.cpp | 9 +- .../cpu_recompiler_code_generator_aarch64.cpp | 10 +- .../cpu_recompiler_code_generator_x64.cpp | 14 +- src/core/system.cpp | 39 +- src/core/system.h | 10 +- src/duckstation-qt/qthost.cpp | 45 ++- src/util/jit_code_buffer.cpp | 332 +----------------- src/util/jit_code_buffer.h | 18 +- src/util/platform_misc_win32.cpp | 1 + 12 files changed, 453 insertions(+), 398 deletions(-) diff --git a/src/common/memmap.cpp b/src/common/memmap.cpp index dcc3dfde1..a89c72826 100644 --- a/src/common/memmap.cpp +++ b/src/common/memmap.cpp @@ -11,24 +11,36 @@ #include "fmt/format.h" +#include + #if defined(_WIN32) #include "windows_headers.h" +#include #elif defined(__APPLE__) #ifdef __aarch64__ #include // pthread_jit_write_protect_np() #endif +#include +#include #include #include #include #include +#include #elif !defined(__ANDROID__) #include +#include #include #include #include #endif -Log_SetChannel(MemoryArena); +Log_SetChannel(MemMap); + +namespace MemMap { +/// Allocates RWX memory at the specified address. +static void* AllocateJITMemoryAt(const void* addr, size_t size); +} // namespace MemMap #ifdef _WIN32 @@ -90,6 +102,44 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size) Panic("Failed to unmap shared memory"); } +const void* MemMap::GetBaseAddress() +{ + const HMODULE mod = GetModuleHandleW(nullptr); + if (!mod) + return nullptr; + + MODULEINFO mi; + if (!GetModuleInformation(GetCurrentProcess(), mod, &mi, sizeof(mi))) + return mod; + + return mi.lpBaseOfDll; +} + +void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size) +{ + void* ptr = static_cast(VirtualAlloc(const_cast(addr), size, + addr ? (MEM_RESERVE | MEM_COMMIT) : MEM_COMMIT, PAGE_EXECUTE_READWRITE)); + if (!ptr && !addr) [[unlikely]] + ERROR_LOG("VirtualAlloc(RWX, {}) for internal buffer failed: {}", size, GetLastError()); + + return ptr; +} + +void MemMap::ReleaseJITMemory(void* ptr, size_t size) +{ + if (!VirtualFree(ptr, 0, MEM_RELEASE)) + ERROR_LOG("Failed to free code pointer {}", static_cast(ptr)); +} + +#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) + +void MemMap::FlushInstructionCache(void* address, size_t size) +{ + ::FlushInstructionCache(GetCurrentProcess(), address, size); +} + +#endif + SharedMemoryMappingArea::SharedMemoryMappingArea() = default; SharedMemoryMappingArea::~SharedMemoryMappingArea() @@ -346,6 +396,93 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size) Panic("Failed to unmap shared memory"); } +const void* MemMap::GetBaseAddress() +{ + u32 name_buffer_size = 0; + _NSGetExecutablePath(nullptr, &name_buffer_size); + if (name_buffer_size > 0) [[likely]] + { + std::unique_ptr name_buffer = std::make_unique_for_overwrite(name_buffer_size + 1); + if (_NSGetExecutablePath(name_buffer.get(), &name_buffer_size) == 0) [[likely]] + { + name_buffer[name_buffer_size] = 0; + + const struct segment_command_64* command = getsegbyname("__TEXT"); + if (command) [[likely]] + { + const u8* base = reinterpret_cast(command->vmaddr); + const u32 image_count = _dyld_image_count(); + for (u32 i = 0; i < image_count; i++) + { + if (std::strcmp(_dyld_get_image_name(i), name_buffer.get()) == 0) + return base + _dyld_get_image_vmaddr_slide(i); + } + } + } + } + + return reinterpret_cast(&GetBaseAddress); +} + +void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size) +{ +#if !defined(__aarch64__) + kern_return_t ret = mach_vm_allocate(mach_task_self(), reinterpret_cast(&addr), size, + addr ? VM_FLAGS_FIXED : VM_FLAGS_ANYWHERE); + if (ret != KERN_SUCCESS) + { + ERROR_LOG("mach_vm_allocate() returned {}", ret); + return nullptr; + } + + ret = mach_vm_protect(mach_task_self(), reinterpret_cast(addr), size, false, + VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE); + if (ret != KERN_SUCCESS) + { + ERROR_LOG("mach_vm_protect() returned {}", ret); + mach_vm_deallocate(mach_task_self(), reinterpret_cast(addr), size); + return nullptr; + } + + return const_cast(addr); +#else + // On ARM64, we need to use MAP_JIT, which means we can't use MAP_FIXED. + if (addr) + return nullptr; + + constexpr int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT; + void* ptr = mmap(const_cast(addr), size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0); + if (ptr == MAP_FAILED) + { + ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", size, errno); + return nullptr; + } + + return ptr; +#endif +} + +void MemMap::ReleaseJITMemory(void* ptr, size_t size) +{ +#if !defined(__aarch64__) + const kern_return_t res = mach_vm_deallocate(mach_task_self(), reinterpret_cast(ptr), size); + if (res != KERN_SUCCESS) + ERROR_LOG("mach_vm_deallocate() failed: {}", res); +#else + if (munmap(ptr, size) != 0) + ERROR_LOG("Failed to free code pointer {}", static_cast(ptr)); +#endif +} + +#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) + +void MemMap::FlushInstructionCache(void* address, size_t size) +{ + __builtin___clear_cache(reinterpret_cast(address), reinterpret_cast(address) + size); +} + +#endif + SharedMemoryMappingArea::SharedMemoryMappingArea() = default; SharedMemoryMappingArea::~SharedMemoryMappingArea() @@ -531,6 +668,72 @@ void MemMap::UnmapSharedMemory(void* baseaddr, size_t size) Panic("Failed to unmap shared memory"); } +const void* MemMap::GetBaseAddress() +{ +#ifndef __APPLE__ + Dl_info info; + if (dladdr(reinterpret_cast(&GetBaseAddress), &info) == 0) + { + ERROR_LOG("dladdr() failed"); + return nullptr; + } + + return info.dli_fbase; +#else +#error Fixme +#endif +} + +void* MemMap::AllocateJITMemoryAt(const void* addr, size_t size) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#if defined(__linux__) + // Linux does the right thing, allows us to not disturb an existing mapping. + if (addr) + flags |= MAP_FIXED_NOREPLACE; +#elif defined(__FreeBSD__) + // FreeBSD achieves the same with MAP_FIXED and MAP_EXCL. + if (addr) + flags |= MAP_FIXED | MAP_EXCL; +#else + // Targeted mapping not available? + if (addr) + return nullptr; +#endif + + void* ptr = mmap(const_cast(addr), size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0); + if (ptr == MAP_FAILED) + { + if (!addr) + ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", size, errno); + + return nullptr; + } + else if (addr && ptr != addr) [[unlikely]] + { + if (munmap(ptr, size) != 0) + ERROR_LOG("Failed to munmap() incorrectly hinted allocation: {}", errno); + return nullptr; + } + + return ptr; +} + +void MemMap::ReleaseJITMemory(void* ptr, size_t size) +{ + if (munmap(ptr, size) != 0) + ERROR_LOG("Failed to free code pointer {}", static_cast(ptr)); +} + +#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) + +void MemMap::FlushInstructionCache(void* address, size_t size) +{ + __builtin___clear_cache(reinterpret_cast(address), reinterpret_cast(address) + size); +} + +#endif + SharedMemoryMappingArea::SharedMemoryMappingArea() = default; SharedMemoryMappingArea::~SharedMemoryMappingArea() @@ -591,3 +794,95 @@ bool SharedMemoryMappingArea::Unmap(void* map_base, size_t map_size) } #endif + +void* MemMap::AllocateJITMemory(size_t size) +{ + const u8* base = + reinterpret_cast(Common::AlignDownPow2(reinterpret_cast(GetBaseAddress()), HOST_PAGE_SIZE)); + u8* ptr = nullptr; +#if !defined(CPU_ARCH_ARM64) || !defined(__APPLE__) + +#if defined(CPU_ARCH_X64) + static constexpr size_t assume_binary_size = 64 * 1024 * 1024; + static constexpr size_t step = 64 * 1024 * 1024; + static constexpr size_t max_displacement = 0x80000000u; +#elif defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) + static constexpr size_t assume_binary_size = 16 * 1024 * 1024; + static constexpr size_t step = 8 * 1024 * 1024; + static constexpr size_t max_displacement = + 1024 * 1024 * 1024; // technically 4GB, but we don't want to spend that much time trying +#elif defined(CPU_ARCH_ARM32) + static constexpr size_t assume_binary_size = 8 * 1024 * 1024; // Wishful thinking... + static constexpr size_t step = 2 * 1024 * 1024; + static constexpr size_t max_displacement = 32 * 1024 * 1024; +#else +#error Unhandled architecture. +#endif + + const size_t max_displacement_from_start = max_displacement - size; + Assert(size <= max_displacement); + + // Try to find a region in the max displacement range of the process base address. + // Assume that the DuckStation binary will at max be some size, release is currently around 12MB on Windows. + // Therefore the max offset is +/- 12MB + code_size. Try allocating in steps by incrementing the pointer, then if no + // address range is found, go backwards from the base address (which will probably fail). + const u8* min_address = + base - std::min(reinterpret_cast(base), static_cast(max_displacement_from_start)); + const u8* max_address = base + max_displacement_from_start; + VERBOSE_LOG("Base address: {}", static_cast(base)); + VERBOSE_LOG("Acceptable address range: {} - {}", static_cast(min_address), + static_cast(max_address)); + + // Start offset by the expected binary size. + for (const u8* current_address = base + assume_binary_size;; current_address += step) + { + VERBOSE_LOG("Trying {} (displacement 0x{:X})", static_cast(current_address), + static_cast(current_address - base)); + if ((ptr = static_cast(AllocateJITMemoryAt(current_address, size)))) + break; + + if ((reinterpret_cast(current_address) + step) > reinterpret_cast(max_address) || + (reinterpret_cast(current_address) + step) < reinterpret_cast(current_address)) + { + break; + } + } + + // Try before (will likely fail). + if (!ptr && reinterpret_cast(base) >= step) + { + for (const u8* current_address = base - step;; current_address -= step) + { + VERBOSE_LOG("Trying {} (displacement 0x{:X})", static_cast(current_address), + static_cast(base - current_address)); + if ((ptr = static_cast(AllocateJITMemoryAt(current_address, size)))) + break; + + if ((reinterpret_cast(current_address) - step) < reinterpret_cast(min_address) || + (reinterpret_cast(current_address) - step) > reinterpret_cast(current_address)) + { + break; + } + } + } + + if (!ptr) + { +#ifdef CPU_ARCH_X64 + ERROR_LOG("Failed to allocate JIT buffer in range, expect crashes."); +#endif + if (!(ptr = static_cast(AllocateJITMemoryAt(nullptr, size)))) + return ptr; + } +#else + // We cannot control where the buffer gets allocated on Apple Silicon. Hope for the best. + if (!(ptr = static_cast(AllocateJITMemoryAt(nullptr, size)))) + return ptr; +#endif + + INFO_LOG("Allocated JIT buffer of size {} at {} (0x{:X} bytes / {} MB away)", size, static_cast(ptr), + std::abs(static_cast(ptr - base)), + (std::abs(static_cast(ptr - base)) + (1024 * 1024 - 1)) / (1024 * 1024)); + + return ptr; +} diff --git a/src/common/memmap.h b/src/common/memmap.h index 288f62a6e..aa65cde93 100644 --- a/src/common/memmap.h +++ b/src/common/memmap.h @@ -58,6 +58,25 @@ void* MapSharedMemory(void* handle, size_t offset, void* baseaddr, size_t size, void UnmapSharedMemory(void* baseaddr, size_t size); bool MemProtect(void* baseaddr, size_t size, PageProtect mode); +/// Returns the base address for the current process. +const void* GetBaseAddress(); + +/// Allocates RWX memory in branch range from the base address. +void* AllocateJITMemory(size_t size); + +/// Releases RWX memory. +void ReleaseJITMemory(void* ptr, size_t size); + +/// Flushes the instruction cache on the host for the specified range. +/// Only needed outside of X86, X86 has coherent D/I cache. +#if !defined(CPU_ARCH_ARM32) && !defined(CPU_ARCH_ARM64) && !defined(CPU_ARCH_RISCV64) +// clang-format off +ALWAYS_INLINE static void FlushInstructionCache(void* address, size_t size) { } +// clang-format on +#else +void FlushInstructionCache(void* address, size_t size); +#endif + /// JIT write protect for Apple Silicon. Needs to be called prior to writing to any RWX pages. #if !defined(__APPLE__) || !defined(__aarch64__) // clang-format off diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 9658abd35..c5130529c 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -123,22 +123,27 @@ PerfScope MIPSPerfScope("MIPS"); #endif -// Currently remapping the code buffer doesn't work in macOS. TODO: Make dynamic instead... -#ifndef __APPLE__ -#define USE_STATIC_CODE_BUFFER 1 -#endif - #if defined(CPU_ARCH_ARM32) // Use a smaller code buffer size on AArch32 to have a better chance of being in range. -static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 16 * 1024 * 1024; -static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 8 * 1024 * 1024; +static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 20 * 1024 * 1024; +static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 4 * 1024 * 1024; #else -static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 32 * 1024 * 1024; +static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 48 * 1024 * 1024; static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 16 * 1024 * 1024; #endif -#ifdef USE_STATIC_CODE_BUFFER -alignas(HOST_PAGE_SIZE) static u8 s_code_storage[RECOMPILER_CODE_CACHE_SIZE + RECOMPILER_FAR_CODE_CACHE_SIZE]; +// On Linux ARM32/ARM64, we use a dedicated section in the ELF for storing code. +// This is because without ASLR, or on certain ASLR offsets, the sbrk() heap ends up immediately following the text/data +// sections, which means there isn't a large enough gap to fit within range on ARM32. +#if defined(__linux__) && (defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)) +#define USE_CODE_BUFFER_SECTION 1 +#ifdef __clang__ +#pragma clang section bss = ".jitstorage" +__attribute__((aligned(HOST_PAGE_SIZE))) static u8 s_code_buffer_ptr[RECOMPILER_CODE_CACHE_SIZE]; +#pragma clang section bss = "" +#endif +#else +static u8* s_code_buffer_ptr = nullptr; #endif static JitCodeBuffer s_code_buffer; @@ -162,20 +167,26 @@ bool CPU::CodeCache::IsUsingFastmem() bool CPU::CodeCache::ProcessStartup(Error* error) { - AllocateLUTs(); - -#ifdef USE_STATIC_CODE_BUFFER - const bool has_buffer = - s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE, HOST_PAGE_SIZE); +#ifdef USE_CODE_BUFFER_SECTION + const u8* module_base = static_cast(MemMap::GetBaseAddress()); + INFO_LOG("Using JIT buffer section of size {} at {} (0x{:X} bytes / {} MB away)", sizeof(s_code_buffer_ptr), + static_cast(s_code_buffer_ptr), std::abs(static_cast(s_code_buffer_ptr - module_base)), + (std::abs(static_cast(s_code_buffer_ptr - module_base)) + (1024 * 1024 - 1)) / (1024 * 1024)); + const bool code_buffer_allocated = + MemMap::MemProtect(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, PageProtect::ReadWriteExecute); #else - const bool has_buffer = false; + s_code_buffer_ptr = static_cast(MemMap::AllocateJITMemory(RECOMPILER_CODE_CACHE_SIZE)); + const bool code_buffer_allocated = (s_code_buffer_ptr != nullptr); #endif - if (!has_buffer && !s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE)) + if (!code_buffer_allocated) [[unlikely]] { - Error::SetStringView(error, "Failed to initialize code space"); + Error::SetStringView(error, "Failed to allocate code storage. The log may contain more information, you will need " + "to run DuckStation with -earlyconsole in the command line."); return false; } + AllocateLUTs(); + if (!PageFaultHandler::Install(error)) return false; @@ -184,17 +195,21 @@ bool CPU::CodeCache::ProcessStartup(Error* error) void CPU::CodeCache::ProcessShutdown() { - s_code_buffer.Destroy(); DeallocateLUTs(); + +#ifndef USE_CODE_BUFFER_SECTION + MemMap::ReleaseJITMemory(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE); +#endif } void CPU::CodeCache::Initialize() { Assert(s_blocks.empty()); + // TODO: Reduce far code size when not using memory exceptions. if (IsUsingAnyRecompiler()) { - s_code_buffer.Reset(); + s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE); CompileASMFunctions(); ResetCodeLUT(); } @@ -219,7 +234,7 @@ void CPU::CodeCache::Reset() if (IsUsingAnyRecompiler()) { ClearASMFunctions(); - s_code_buffer.Reset(); + s_code_buffer.Reset(s_code_buffer_ptr, RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE); CompileASMFunctions(); ResetCodeLUT(); } diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index 6f0c996be..b8be018e1 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "common/align.h" #include "common/assert.h" #include "common/log.h" +#include "common/memmap.h" #include "cpu_code_cache_private.h" #include "cpu_core.h" @@ -171,7 +172,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) } if (flush_icache) - JitCodeBuffer::FlushInstructionCache(code, kA32InstructionSizeInBytes); + MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes); return kA32InstructionSizeInBytes; } @@ -202,7 +203,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target) s_trampoline_targets.emplace(target, offset); s_trampoline_used = offset + static_cast(size); - JitCodeBuffer::FlushInstructionCache(start, size); + MemMap::FlushInstructionCache(start, size); return start; } @@ -1790,7 +1791,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore for (s32 i = 0; i < nops; i++) emit.nop(); - JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); + MemMap::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 5ef283baf..52f0fb572 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1,9 +1,11 @@ -// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "common/align.h" #include "common/assert.h" #include "common/log.h" +#include "common/memmap.h" + #include "cpu_code_cache_private.h" #include "cpu_core.h" #include "cpu_core_private.h" @@ -274,7 +276,7 @@ u8* CPU::Recompiler::armGetJumpTrampoline(const void* target) s_trampoline_targets.emplace(target, offset); s_trampoline_used = offset + static_cast(size); - JitCodeBuffer::FlushInstructionCache(start, size); + MemMap::FlushInstructionCache(start, size); return start; } @@ -316,7 +318,7 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) const u32 new_code = B | Assembler::ImmUncondBranch(disp); std::memcpy(code, &new_code, sizeof(new_code)); if (flush_icache) - JitCodeBuffer::FlushInstructionCache(code, kInstructionSize); + MemMap::FlushInstructionCache(code, kInstructionSize); return kInstructionSize; } @@ -2100,7 +2102,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore for (s32 i = 0; i < nops; i++) emit.nop(); - JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); + MemMap::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 89ed92e11..8679cbf65 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -12,6 +12,7 @@ #include "common/align.h" #include "common/assert.h" #include "common/log.h" +#include "common/memmap.h" #ifdef CPU_ARCH_X64 @@ -1768,15 +1769,8 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) void CodeGenerator::EmitCall(const void* ptr) { - if (Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) - { - m_emit->call(ptr); - } - else - { - m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); - m_emit->call(GetHostReg64(RRETURN)); - } + DebugAssert(Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))); + m_emit->call(ptr); } void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) @@ -2530,7 +2524,7 @@ void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::Loadstore for (s32 i = 0; i < nops; i++) cg.nop(); - JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size); + MemMap::FlushInstructionCache(host_pc, lbi.code_size); } void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) diff --git a/src/core/system.cpp b/src/core/system.cpp index d6f055b93..e2df61660 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -318,6 +318,34 @@ void System::CheckCacheLineSize() } } +bool System::Internal::ProcessStartup(Error* error) +{ + Common::Timer timer; + + // Allocate JIT memory as soon as possible. + if (!CPU::CodeCache::ProcessStartup(error)) + return false; + + // Fastmem alloc *must* come after JIT alloc, otherwise it tends to eat the 4GB region after the executable on MacOS. + if (!Bus::AllocateMemory(error)) + { + CPU::CodeCache::ProcessShutdown(); + return false; + } + + VERBOSE_LOG("Memory allocation took {} ms.", timer.GetTimeMilliseconds()); + + CheckCacheLineSize(); + + return true; +} + +void System::Internal::ProcessShutdown() +{ + Bus::ReleaseMemory(); + CPU::CodeCache::ProcessShutdown(); +} + bool System::Internal::CPUThreadInitialize(Error* error) { #ifdef _WIN32 @@ -332,17 +360,9 @@ bool System::Internal::CPUThreadInitialize(Error* error) } #endif - if (!CPU::CodeCache::ProcessStartup(error) || !Bus::AllocateMemory(error)) - { - CPUThreadShutdown(); - return false; - } - // This will call back to Host::LoadSettings() -> ReloadSources(). LoadSettings(false); - CheckCacheLineSize(); - #ifdef ENABLE_RAINTEGRATION if (Host::GetBaseBoolSettingValue("Cheevos", "UseRAIntegration", false)) Achievements::SwitchToRAIntegration(); @@ -377,9 +397,6 @@ void System::Internal::CPUThreadShutdown() InputManager::CloseSources(); - CPU::CodeCache::ProcessShutdown(); - Bus::ReleaseMemory(); - #ifdef _WIN32 CoUninitialize(); #endif diff --git a/src/core/system.h b/src/core/system.h index ded9af2d8..19119ee79 100644 --- a/src/core/system.h +++ b/src/core/system.h @@ -504,10 +504,16 @@ namespace Internal { /// Performs mandatory hardware checks. bool PerformEarlyHardwareChecks(Error* error); -/// Called on process startup. -bool CPUThreadInitialize(Error* error); +/// Called on process startup, as early as possible. +bool ProcessStartup(Error* error); /// Called on process shutdown. +void ProcessShutdown(); + +/// Called on CPU thread initialization. +bool CPUThreadInitialize(Error* error); + +/// Called on CPU thread shutdown. void CPUThreadShutdown(); /// Polls input, updates subsystems which are present while paused/inactive. diff --git a/src/duckstation-qt/qthost.cpp b/src/duckstation-qt/qthost.cpp index 09fc65fd6..256b9bdaa 100644 --- a/src/duckstation-qt/qthost.cpp +++ b/src/duckstation-qt/qthost.cpp @@ -90,6 +90,7 @@ static constexpr u32 GDB_SERVER_POLLING_INTERVAL = 1; ////////////////////////////////////////////////////////////////////////// namespace QtHost { static bool PerformEarlyHardwareChecks(); +static bool EarlyProcessStartup(); static void RegisterTypes(); static bool InitializeConfig(std::string settings_filename); static bool ShouldUsePortableMode(); @@ -128,11 +129,26 @@ EmuThread::EmuThread(QThread* ui_thread) : QThread(), m_ui_thread(ui_thread) EmuThread::~EmuThread() = default; +void QtHost::RegisterTypes() +{ + // Register any standard types we need elsewhere + qRegisterMetaType>("std::optional()"); + qRegisterMetaType>(); + qRegisterMetaType>("std::function"); + qRegisterMetaType>(); + qRegisterMetaType(); + qRegisterMetaType("GPURenderer"); + qRegisterMetaType("InputBindingKey"); + qRegisterMetaType("std::string"); + qRegisterMetaType>>( + "std::vector>"); +} + bool QtHost::PerformEarlyHardwareChecks() { Error error; const bool okay = System::Internal::PerformEarlyHardwareChecks(&error); - if (okay && !error.IsValid()) + if (okay && !error.IsValid()) [[likely]] return true; if (okay) @@ -149,19 +165,15 @@ bool QtHost::PerformEarlyHardwareChecks() return okay; } -void QtHost::RegisterTypes() +bool QtHost::EarlyProcessStartup() { - // Register any standard types we need elsewhere - qRegisterMetaType>("std::optional()"); - qRegisterMetaType>(); - qRegisterMetaType>("std::function"); - qRegisterMetaType>(); - qRegisterMetaType(); - qRegisterMetaType("GPURenderer"); - qRegisterMetaType("InputBindingKey"); - qRegisterMetaType("std::string"); - qRegisterMetaType>>( - "std::vector>"); + Error error; + if (System::Internal::ProcessStartup(&error)) [[likely]] + return true; + + QMessageBox::critical(nullptr, QStringLiteral("Process Startup Failed"), + QString::fromStdString(error.GetDescription())); + return false; } bool QtHost::InBatchMode() @@ -452,7 +464,7 @@ bool QtHost::InitializeConfig(std::string settings_filename) EmuFolders::EnsureFoldersExist(); MigrateSettings(); - // We need to create the console window early, otherwise it appears behind the main window. + // We need to create the console window early, otherwise it appears in front of the main window. if (!Log::IsConsoleOutputEnabled() && s_base_settings_interface->GetBoolValue("Logging", "LogToConsole", Settings::DEFAULT_LOG_TO_CONSOLE)) { @@ -2508,6 +2520,9 @@ int main(int argc, char* argv[]) if (!QtHost::ParseCommandLineParametersAndInitializeConfig(app, autoboot)) return EXIT_FAILURE; + if (!QtHost::EarlyProcessStartup()) + return EXIT_FAILURE; + // Remove any previous-version remanants. if (s_cleanup_after_update) AutoUpdaterDialog::cleanupAfterUpdate(); @@ -2581,5 +2596,7 @@ shutdown_and_exit: // Ensure log is flushed. Log::SetFileOutputParams(false, nullptr); + System::Internal::ProcessShutdown(); + return result; } diff --git a/src/util/jit_code_buffer.cpp b/src/util/jit_code_buffer.cpp index 2e4976a31..7b0357dd4 100644 --- a/src/util/jit_code_buffer.cpp +++ b/src/util/jit_code_buffer.cpp @@ -1,301 +1,40 @@ -// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "jit_code_buffer.h" #include "common/align.h" #include "common/assert.h" -#include "common/log.h" #include "common/memmap.h" #include - -Log_SetChannel(JitCodeBuffer); - -#if defined(_WIN32) -#include "common/windows_headers.h" -#else -#include -#include -#ifdef __APPLE__ -#include -#include -#endif -#endif +#include JitCodeBuffer::JitCodeBuffer() = default; -JitCodeBuffer::JitCodeBuffer(u32 size, u32 far_code_size) +JitCodeBuffer::~JitCodeBuffer() = default; + +void JitCodeBuffer::Reset(void* ptr, u32 size, u32 far_code_size /* = 0 */) { - if (!Allocate(size, far_code_size)) - Panic("Failed to allocate code space"); -} - -JitCodeBuffer::JitCodeBuffer(void* buffer, u32 size, u32 far_code_size, u32 guard_pages) -{ - if (!Initialize(buffer, size, far_code_size)) - Panic("Failed to initialize code space"); -} - -JitCodeBuffer::~JitCodeBuffer() -{ - Destroy(); -} - -bool JitCodeBuffer::Allocate(u32 size /* = 64 * 1024 * 1024 */, u32 far_code_size /* = 0 */) -{ - Destroy(); - - m_total_size = size + far_code_size; - -#ifdef CPU_ARCH_X64 - // Try to find a region in 32-bit range of ourselves. - // Assume that the DuckStation binary will at max be 256MB. Therefore the max offset is - // +/- 256MB + round_up_pow2(size). This'll be 512MB for the JITs. - static const u8 base_ptr = 0; - const u8* base = - reinterpret_cast(Common::AlignDownPow2(reinterpret_cast(&base_ptr), HOST_PAGE_SIZE)); - const u32 max_displacement = 0x80000000u - Common::NextPow2(256 * 1024 * 1024 + m_total_size); - const u8* max_address = ((base + max_displacement) < base) ? - reinterpret_cast(std::numeric_limits::max()) : - (base + max_displacement); - const u8* min_address = ((base - max_displacement) > base) ? nullptr : (base - max_displacement); - const u32 step = 64 * 1024 * 1024; - const u32 steps = static_cast(max_address - min_address) / step; - for (u32 offset = 0; offset < steps; offset++) - { - const u8* addr = max_address - (offset * step); - VERBOSE_LOG("Trying {} (base {}, offset {}, displacement 0x{:X})", static_cast(addr), - static_cast(base), offset, static_cast(addr - base)); - if (TryAllocateAt(addr)) - break; - } - if (m_code_ptr) - { - INFO_LOG("Allocated JIT buffer of size {} at {} (0x{:X} bytes away)", m_total_size, static_cast(m_code_ptr), - static_cast(m_code_ptr - base)); - } - else - { - ERROR_LOG("Failed to allocate JIT buffer in range, expect crashes."); - if (!TryAllocateAt(nullptr)) - return false; - } -#else - if (!TryAllocateAt(nullptr)) - return false; -#endif - - m_free_code_ptr = m_code_ptr; - m_code_size = size; - m_code_used = 0; - - m_far_code_ptr = static_cast(m_code_ptr) + size; - m_free_far_code_ptr = m_far_code_ptr; - m_far_code_size = far_code_size; - m_far_code_used = 0; - - m_old_protection = 0; - m_owns_buffer = true; - return true; -} - -bool JitCodeBuffer::TryAllocateAt(const void* addr) -{ -#if defined(_WIN32) - m_code_ptr = static_cast(VirtualAlloc(const_cast(addr), m_total_size, - addr ? (MEM_RESERVE | MEM_COMMIT) : MEM_COMMIT, PAGE_EXECUTE_READWRITE)); - if (!m_code_ptr) - { - if (!addr) - ERROR_LOG("VirtualAlloc(RWX, {}) for internal buffer failed: {}", m_total_size, GetLastError()); - return false; - } - - return true; -#elif defined(__APPLE__) && !defined(__aarch64__) - kern_return_t ret = mach_vm_allocate(mach_task_self(), reinterpret_cast(&addr), m_total_size, - addr ? VM_FLAGS_FIXED : VM_FLAGS_ANYWHERE); - if (ret != KERN_SUCCESS) - { - ERROR_LOG("mach_vm_allocate() returned {}", ret); - return false; - } - - ret = mach_vm_protect(mach_task_self(), reinterpret_cast(addr), m_total_size, false, - VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE); - if (ret != KERN_SUCCESS) - { - ERROR_LOG("mach_vm_protect() returned {}", ret); - mach_vm_deallocate(mach_task_self(), reinterpret_cast(addr), m_total_size); - return false; - } - - m_code_ptr = static_cast(const_cast(addr)); - return true; -#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__) - int flags = MAP_PRIVATE | MAP_ANONYMOUS; -#if defined(__linux__) - // Linux does the right thing, allows us to not disturb an existing mapping. - if (addr) - flags |= MAP_FIXED_NOREPLACE; -#elif defined(__FreeBSD__) - // FreeBSD achieves the same with MAP_FIXED and MAP_EXCL. - if (addr) - flags |= MAP_FIXED | MAP_EXCL; -#elif defined(__APPLE__) - // On ARM64, we need to use MAP_JIT, which means we can't use MAP_FIXED. - if (addr) - return false; - flags |= MAP_JIT; -#endif - - m_code_ptr = - static_cast(mmap(const_cast(addr), m_total_size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, -1, 0)); - if (!m_code_ptr) - { - if (!addr) - ERROR_LOG("mmap(RWX, {}) for internal buffer failed: {}", m_total_size, errno); - - return false; - } - else if (addr && m_code_ptr != addr) - { - if (munmap(m_code_ptr, m_total_size) != 0) - ERROR_LOG("Failed to munmap() incorrectly hinted allocation: {}", errno); - m_code_ptr = nullptr; - return false; - } - - return true; -#else - return false; -#endif -} - -bool JitCodeBuffer::Initialize(void* buffer, u32 size, u32 far_code_size /* = 0 */, u32 guard_size /* = 0 */) -{ - Destroy(); - - if ((far_code_size > 0 && guard_size >= far_code_size) || (far_code_size + (guard_size * 2)) > size) - return false; - -#if defined(_WIN32) - DWORD old_protect = 0; - if (!VirtualProtect(buffer, size, PAGE_EXECUTE_READWRITE, &old_protect)) - { - ERROR_LOG("VirtualProtect(RWX) for external buffer failed: {}", GetLastError()); - return false; - } - - if (guard_size > 0) - { - DWORD old_guard_protect = 0; - u8* guard_at_end = (static_cast(buffer) + size) - guard_size; - if (!VirtualProtect(buffer, guard_size, PAGE_NOACCESS, &old_guard_protect) || - !VirtualProtect(guard_at_end, guard_size, PAGE_NOACCESS, &old_guard_protect)) - { - ERROR_LOG("VirtualProtect(NOACCESS) for guard page failed: {}", GetLastError()); - return false; - } - } - - m_code_ptr = static_cast(buffer); - m_old_protection = static_cast(old_protect); -#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__) - if (mprotect(buffer, size, PROT_READ | PROT_WRITE | PROT_EXEC) != 0) - { - ERROR_LOG("mprotect(RWX) for external buffer failed: {}", errno); - return false; - } - - if (guard_size > 0) - { - u8* guard_at_end = (static_cast(buffer) + size) - guard_size; - if (mprotect(buffer, guard_size, PROT_NONE) != 0 || mprotect(guard_at_end, guard_size, PROT_NONE) != 0) - { - ERROR_LOG("mprotect(NONE) for guard page failed: {}", errno); - return false; - } - } - - // reasonable default? - m_code_ptr = static_cast(buffer); - m_old_protection = PROT_READ | PROT_WRITE; -#else - m_code_ptr = nullptr; -#endif - - if (!m_code_ptr) - return false; + Assert(far_code_size < size); m_total_size = size; - m_free_code_ptr = m_code_ptr + guard_size; - m_code_size = size - far_code_size - (guard_size * 2); + m_code_ptr = static_cast(ptr); + m_free_code_ptr = m_code_ptr; + m_code_size = size - far_code_size; m_code_used = 0; - m_far_code_ptr = static_cast(m_code_ptr) + m_code_size; + m_far_code_size = far_code_size; + m_far_code_ptr = (far_code_size > 0) ? (static_cast(m_code_ptr) + m_code_size) : nullptr; m_free_far_code_ptr = m_far_code_ptr; - m_far_code_size = far_code_size - guard_size; m_far_code_used = 0; - m_guard_size = guard_size; - m_owns_buffer = false; - return true; -} + MemMap::BeginCodeWrite(); -void JitCodeBuffer::Destroy() -{ - if (m_owns_buffer) - { -#if defined(_WIN32) - if (!VirtualFree(m_code_ptr, 0, MEM_RELEASE)) - ERROR_LOG("Failed to free code pointer {}", static_cast(m_code_ptr)); -#elif defined(__APPLE__) && !defined(__aarch64__) - const kern_return_t res = - mach_vm_deallocate(mach_task_self(), reinterpret_cast(m_code_ptr), m_total_size); - if (res != KERN_SUCCESS) - ERROR_LOG("mach_vm_deallocate() failed: {}", res); -#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__) - if (munmap(m_code_ptr, m_total_size) != 0) - ERROR_LOG("Failed to free code pointer {}", static_cast(m_code_ptr)); -#endif - } - else if (m_code_ptr) - { -#if defined(_WIN32) - DWORD old_protect = 0; - if (!VirtualProtect(m_code_ptr, m_total_size, m_old_protection, &old_protect)) - ERROR_LOG("Failed to restore protection on {}", static_cast(m_code_ptr)); -#else - if (mprotect(m_code_ptr, m_total_size, m_old_protection) != 0) - ERROR_LOG("Failed to restore protection on {}", static_cast(m_code_ptr)); -#endif - } + std::memset(m_code_ptr, 0, m_total_size); + MemMap::FlushInstructionCache(m_code_ptr, m_total_size); - m_code_ptr = nullptr; - m_free_code_ptr = nullptr; - m_code_size = 0; - m_code_reserve_size = 0; - m_code_used = 0; - m_far_code_ptr = nullptr; - m_free_far_code_ptr = nullptr; - m_far_code_size = 0; - m_far_code_used = 0; - m_total_size = 0; - m_guard_size = 0; - m_old_protection = 0; - m_owns_buffer = false; -} - -void JitCodeBuffer::ReserveCode(u32 size) -{ - Assert(m_code_used == 0); - Assert(size < m_code_size); - - m_code_reserve_size += size; - m_free_code_ptr += size; - m_code_size -= size; + MemMap::EndCodeWrite(); } void JitCodeBuffer::CommitCode(u32 length) @@ -303,10 +42,7 @@ void JitCodeBuffer::CommitCode(u32 length) if (length == 0) return; -#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) - // ARM instruction and data caches are not coherent, we need to flush after every block. - FlushInstructionCache(m_free_code_ptr, length); -#endif + MemMap::FlushInstructionCache(m_free_code_ptr, length); Assert(length <= (m_code_size - m_code_used)); m_free_code_ptr += length; @@ -318,36 +54,13 @@ void JitCodeBuffer::CommitFarCode(u32 length) if (length == 0) return; -#if defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64) || defined(CPU_ARCH_RISCV64) - // ARM instruction and data caches are not coherent, we need to flush after every block. - FlushInstructionCache(m_free_far_code_ptr, length); -#endif + MemMap::FlushInstructionCache(m_free_far_code_ptr, length); Assert(length <= (m_far_code_size - m_far_code_used)); m_free_far_code_ptr += length; m_far_code_used += length; } -void JitCodeBuffer::Reset() -{ - MemMap::BeginCodeWrite(); - - m_free_code_ptr = m_code_ptr + m_guard_size + m_code_reserve_size; - m_code_used = 0; - std::memset(m_free_code_ptr, 0, m_code_size); - FlushInstructionCache(m_free_code_ptr, m_code_size); - - if (m_far_code_size > 0) - { - m_free_far_code_ptr = m_far_code_ptr; - m_far_code_used = 0; - std::memset(m_free_far_code_ptr, 0, m_far_code_size); - FlushInstructionCache(m_free_far_code_ptr, m_far_code_size); - } - - MemMap::EndCodeWrite(); -} - void JitCodeBuffer::Align(u32 alignment, u8 padding_value) { DebugAssert(Common::IsPow2(alignment)); @@ -359,14 +72,3 @@ void JitCodeBuffer::Align(u32 alignment, u8 padding_value) m_free_code_ptr += num_padding_bytes; m_code_used += num_padding_bytes; } - -void JitCodeBuffer::FlushInstructionCache(void* address, u32 size) -{ -#if defined(_WIN32) - ::FlushInstructionCache(GetCurrentProcess(), address, size); -#elif defined(__GNUC__) || defined(__clang__) - __builtin___clear_cache(reinterpret_cast(address), reinterpret_cast(address) + size); -#else -#error Unknown platform. -#endif -} diff --git a/src/util/jit_code_buffer.h b/src/util/jit_code_buffer.h index 08e586201..69ecb997d 100644 --- a/src/util/jit_code_buffer.h +++ b/src/util/jit_code_buffer.h @@ -8,17 +8,12 @@ class JitCodeBuffer { public: JitCodeBuffer(); - JitCodeBuffer(u32 size, u32 far_code_size); - JitCodeBuffer(void* buffer, u32 size, u32 far_code_size, u32 guard_size); ~JitCodeBuffer(); bool IsValid() const { return (m_code_ptr != nullptr); } - bool Allocate(u32 size = 64 * 1024 * 1024, u32 far_code_size = 0); - bool Initialize(void* buffer, u32 size, u32 far_code_size = 0, u32 guard_size = 0); - void Destroy(); - void Reset(); - + void Reset(void* ptr, u32 size, u32 far_code_size = 0); + ALWAYS_INLINE u8* GetCodePointer() const { return m_code_ptr; } ALWAYS_INLINE u32 GetTotalSize() const { return m_total_size; } ALWAYS_INLINE float GetUsedPct() const @@ -33,7 +28,6 @@ public: ALWAYS_INLINE u8* GetFreeCodePointer() const { return m_free_code_ptr; } ALWAYS_INLINE u32 GetFreeCodeSpace() const { return static_cast(m_code_size - m_code_used); } - void ReserveCode(u32 size); void CommitCode(u32 length); ALWAYS_INLINE u8* GetFreeFarCodePointer() const { return m_free_far_code_ptr; } @@ -44,12 +38,7 @@ public: /// Assumes alignment is a power-of-two. void Align(u32 alignment, u8 padding_value); - /// Flushes the instruction cache on the host for the specified range. - static void FlushInstructionCache(void* address, u32 size); - private: - bool TryAllocateAt(const void* addr); - u8* m_code_ptr = nullptr; u8* m_free_code_ptr = nullptr; u32 m_code_size = 0; @@ -62,7 +51,4 @@ private: u32 m_far_code_used = 0; u32 m_total_size = 0; - u32 m_guard_size = 0; - u32 m_old_protection = 0; - bool m_owns_buffer = false; }; diff --git a/src/util/platform_misc_win32.cpp b/src/util/platform_misc_win32.cpp index 2eb8ae4a1..a27040629 100644 --- a/src/util/platform_misc_win32.cpp +++ b/src/util/platform_misc_win32.cpp @@ -14,6 +14,7 @@ #include #include "common/windows_headers.h" +#include #include #include