CPU/CodeCache: Rewrite using new-rec's block management

This commit is contained in:
Stenzek 2023-10-04 00:19:17 +10:00
parent f82d08e223
commit 79e1ae3e54
No known key found for this signature in database
29 changed files with 3865 additions and 2520 deletions

View file

@ -38,6 +38,8 @@ add_library(common
minizip_helpers.cpp
minizip_helpers.h
path.h
perf_scope.cpp
perf_scope.h
progress_callback.cpp
progress_callback.h
rectangle.h

View file

@ -28,6 +28,7 @@
<ClInclude Include="memory_settings_interface.h" />
<ClInclude Include="md5_digest.h" />
<ClInclude Include="path.h" />
<ClInclude Include="perf_scope.h" />
<ClInclude Include="progress_callback.h" />
<ClInclude Include="rectangle.h" />
<ClInclude Include="scoped_guard.h" />
@ -59,6 +60,7 @@
<ClCompile Include="memory_settings_interface.cpp" />
<ClCompile Include="md5_digest.cpp" />
<ClCompile Include="minizip_helpers.cpp" />
<ClCompile Include="perf_scope.cpp" />
<ClCompile Include="progress_callback.cpp" />
<ClCompile Include="sha1_digest.cpp" />
<ClCompile Include="small_string.cpp" />

View file

@ -43,6 +43,7 @@
<ClInclude Include="fastjmp.h" />
<ClInclude Include="memmap.h" />
<ClInclude Include="intrin.h" />
<ClInclude Include="perf_scope.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="small_string.cpp" />
@ -69,6 +70,7 @@
<ClCompile Include="sha1_digest.cpp" />
<ClCompile Include="fastjmp.cpp" />
<ClCompile Include="memmap.cpp" />
<ClCompile Include="perf_scope.cpp" />
</ItemGroup>
<ItemGroup>
<Natvis Include="bitfield.natvis" />

View file

@ -28,7 +28,7 @@
#endif
template<typename T>
static inline void MemsetPtrs(T* ptr, T value, u32 count)
ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
{
static_assert(std::is_pointer_v<T>, "T is pointer type");
static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer");

198
src/common/perf_scope.cpp Normal file
View file

@ -0,0 +1,198 @@
// SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "perf_scope.h"
#include "assert.h"
#include "string_util.h"
#include <array>
#include <cstring>
#ifdef __linux__
#include <atomic>
#include <ctime>
#include <elf.h>
#include <mutex>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
// #define ProfileWithPerf
// #define ProfileWithPerfJitDump
// Perf is only supported on linux
#if defined(__linux__) && defined(ProfileWithPerf)
static std::FILE* s_map_file = nullptr;
static bool s_map_file_opened = false;
static std::mutex s_mutex;
static void RegisterMethod(const void* ptr, size_t size, const char* symbol)
{
std::unique_lock lock(s_mutex);
if (!s_map_file)
{
if (s_map_file_opened)
return;
char file[256];
snprintf(file, std::size(file), "/tmp/perf-%d.map", getpid());
s_map_file = std::fopen(file, "wb");
s_map_file_opened = true;
if (!s_map_file)
return;
}
std::fprintf(s_map_file, "%" PRIx64 " %zx %s\n", static_cast<u64>(reinterpret_cast<uintptr_t>(ptr)), size, symbol);
std::fflush(s_map_file);
}
#elif defined(__linux__) && defined(ProfileWithPerfJitDump)
enum : u32
{
JIT_CODE_LOAD = 0,
JIT_CODE_MOVE = 1,
JIT_CODE_DEBUG_INFO = 2,
JIT_CODE_CLOSE = 3,
JIT_CODE_UNWINDING_INFO = 4
};
#pragma pack(push, 1)
struct JITDUMP_HEADER
{
u32 magic = 0x4A695444; // JiTD
u32 version = 1;
u32 header_size = sizeof(JITDUMP_HEADER);
u32 elf_mach;
u32 pad1 = 0;
u32 pid;
u64 timestamp;
u64 flags = 0;
};
struct JITDUMP_RECORD_HEADER
{
u32 id;
u32 total_size;
u64 timestamp;
};
struct JITDUMP_CODE_LOAD
{
JITDUMP_RECORD_HEADER header;
u32 pid;
u32 tid;
u64 vma;
u64 code_addr;
u64 code_size;
u64 code_index;
// name
};
#pragma pack(pop)
static u64 JitDumpTimestamp()
{
struct timespec ts = {};
clock_gettime(CLOCK_MONOTONIC, &ts);
return (static_cast<u64>(ts.tv_sec) * 1000000000ULL) + static_cast<u64>(ts.tv_nsec);
}
static FILE* s_jitdump_file = nullptr;
static bool s_jitdump_file_opened = false;
static std::mutex s_jitdump_mutex;
static u32 s_jitdump_record_id;
static void RegisterMethod(const void* ptr, size_t size, const char* symbol)
{
const u32 namelen = std::strlen(symbol) + 1;
std::unique_lock lock(s_jitdump_mutex);
if (!s_jitdump_file)
{
if (!s_jitdump_file_opened)
{
char file[256];
snprintf(file, std::size(file), "jit-%d.dump", getpid());
s_jitdump_file = fopen(file, "w+b");
s_jitdump_file_opened = true;
if (!s_jitdump_file)
return;
}
void* perf_marker = mmap(nullptr, 4096, PROT_READ | PROT_EXEC, MAP_PRIVATE, fileno(s_jitdump_file), 0);
AssertMsg(perf_marker != MAP_FAILED, "Map perf marker");
JITDUMP_HEADER jh = {};
#if defined(__aarch64__)
jh.elf_mach = EM_AARCH64;
#else
jh.elf_mach = EM_X86_64;
#endif
jh.pid = getpid();
jh.timestamp = JitDumpTimestamp();
std::fwrite(&jh, sizeof(jh), 1, s_jitdump_file);
}
JITDUMP_CODE_LOAD cl = {};
cl.header.id = JIT_CODE_LOAD;
cl.header.total_size = sizeof(cl) + namelen + static_cast<u32>(size);
cl.header.timestamp = JitDumpTimestamp();
cl.pid = getpid();
cl.tid = syscall(SYS_gettid);
cl.vma = 0;
cl.code_addr = static_cast<u64>(reinterpret_cast<uintptr_t>(ptr));
cl.code_size = static_cast<u64>(size);
cl.code_index = s_jitdump_record_id++;
std::fwrite(&cl, sizeof(cl), 1, s_jitdump_file);
std::fwrite(symbol, namelen, 1, s_jitdump_file);
std::fwrite(ptr, size, 1, s_jitdump_file);
std::fflush(s_jitdump_file);
}
#endif
#if defined(__linux__) && (defined(ProfileWithPerf) || defined(ProfileWithPerfJitDump))
void PerfScope::Register(const void* ptr, size_t size, const char* symbol)
{
char full_symbol[128];
if (HasPrefix())
std::snprintf(full_symbol, std::size(full_symbol), "%s_%s", m_prefix, symbol);
else
StringUtil::Strlcpy(full_symbol, symbol, std::size(full_symbol));
RegisterMethod(ptr, size, full_symbol);
}
void PerfScope::RegisterPC(const void* ptr, size_t size, u32 pc)
{
char full_symbol[128];
if (HasPrefix())
std::snprintf(full_symbol, std::size(full_symbol), "%s_%08X", m_prefix, pc);
else
std::snprintf(full_symbol, std::size(full_symbol), "%08X", pc);
RegisterMethod(ptr, size, full_symbol);
}
void PerfScope::RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key)
{
char full_symbol[128];
if (HasPrefix())
std::snprintf(full_symbol, std::size(full_symbol), "%s_%s%016" PRIX64, m_prefix, prefix, key);
else
std::snprintf(full_symbol, std::size(full_symbol), "%s%016" PRIX64, prefix, key);
RegisterMethod(ptr, size, full_symbol);
}
#else
void PerfScope::Register(const void* ptr, size_t size, const char* symbol)
{
}
void PerfScope::RegisterPC(const void* ptr, size_t size, u32 pc)
{
}
void PerfScope::RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key)
{
}
#endif

20
src/common/perf_scope.h Normal file
View file

@ -0,0 +1,20 @@
// SPDX-FileCopyrightText: 2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "types.h"
class PerfScope
{
public:
constexpr PerfScope(const char* prefix) : m_prefix(prefix) {}
bool HasPrefix() const { return (m_prefix && m_prefix[0]); }
void Register(const void* ptr, size_t size, const char* symbol);
void RegisterPC(const void* ptr, size_t size, u32 pc);
void RegisterKey(const void* ptr, size_t size, const char* prefix, u64 key);
private:
const char* m_prefix;
};

View file

@ -19,6 +19,7 @@ add_library(core
controller.h
cpu_code_cache.cpp
cpu_code_cache.h
cpu_code_cache_private.h
cpu_core.cpp
cpu_core.h
cpu_core_private.h

View file

@ -85,8 +85,8 @@ enum : TickCount
enum : u32
{
RAM_2MB_CODE_PAGE_COUNT = (RAM_2MB_SIZE + (HOST_PAGE_SIZE + 1)) / HOST_PAGE_SIZE,
RAM_8MB_CODE_PAGE_COUNT = (RAM_8MB_SIZE + (HOST_PAGE_SIZE + 1)) / HOST_PAGE_SIZE,
RAM_2MB_CODE_PAGE_COUNT = (RAM_2MB_SIZE + (HOST_PAGE_SIZE - 1)) / HOST_PAGE_SIZE,
RAM_8MB_CODE_PAGE_COUNT = (RAM_8MB_SIZE + (HOST_PAGE_SIZE - 1)) / HOST_PAGE_SIZE,
MEMORY_LUT_PAGE_SIZE = 4096,
MEMORY_LUT_PAGE_SHIFT = 12,

View file

@ -85,6 +85,7 @@
<ClInclude Include="cdrom_async_reader.h" />
<ClInclude Include="cheats.h" />
<ClInclude Include="achievements.h" />
<ClInclude Include="cpu_code_cache_private.h" />
<ClInclude Include="cpu_core.h" />
<ClInclude Include="cpu_core_private.h" />
<ClInclude Include="cpu_disasm.h" />
@ -176,6 +177,9 @@
<ProjectReference Include="..\..\dep\zstd\zstd.vcxproj">
<Project>{73ee0c55-6ffe-44e7-9c12-baa52434a797}</Project>
</ProjectReference>
<ProjectReference Include="..\..\dep\zydis\zydis.vcxproj">
<Project>{c51a346a-86b2-46df-9bb3-d0aa7e5d8699}</Project>
</ProjectReference>
<ProjectReference Include="..\scmversion\scmversion.vcxproj">
<Project>{075ced82-6a20-46df-94c7-9624ac9ddbeb}</Project>
</ProjectReference>

View file

@ -124,5 +124,6 @@
<ClInclude Include="shader_cache_version.h" />
<ClInclude Include="gpu_shadergen.h" />
<ClInclude Include="pch.h" />
<ClInclude Include="cpu_code_cache_private.h" />
</ItemGroup>
</Project>

File diff suppressed because it is too large Load diff

View file

@ -1,160 +1,42 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "bus.h"
#include "common/bitfield.h"
#include "cpu_types.h"
#include "util/jit_code_buffer.h"
#include "util/page_fault_handler.h"
#include <array>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#ifdef ENABLE_RECOMPILER
#include "cpu_recompiler_types.h"
#endif
namespace CPU::CodeCache {
namespace CPU {
/// Returns true if any recompiler is in use.
bool IsUsingAnyRecompiler();
union CodeBlockKey
{
u32 bits;
/// Returns true if any recompiler and fastmem is in use.
bool IsUsingFastmem();
BitField<u32, bool, 0, 1> user_mode;
BitField<u32, u32, 2, 30> aligned_pc;
/// Allocates resources, call once at startup.
void ProcessStartup();
ALWAYS_INLINE u32 GetPC() const { return aligned_pc << 2; }
ALWAYS_INLINE void SetPC(u32 pc) { aligned_pc = pc >> 2; }
ALWAYS_INLINE u32 GetPCPhysicalAddress() const { return (aligned_pc << 2) & PHYSICAL_MEMORY_ADDRESS_MASK; }
ALWAYS_INLINE CodeBlockKey() = default;
ALWAYS_INLINE CodeBlockKey(const CodeBlockKey& rhs) : bits(rhs.bits) {}
ALWAYS_INLINE CodeBlockKey& operator=(const CodeBlockKey& rhs)
{
bits = rhs.bits;
return *this;
}
ALWAYS_INLINE bool operator==(const CodeBlockKey& rhs) const { return bits == rhs.bits; }
ALWAYS_INLINE bool operator!=(const CodeBlockKey& rhs) const { return bits != rhs.bits; }
ALWAYS_INLINE bool operator<(const CodeBlockKey& rhs) const { return bits < rhs.bits; }
};
struct CodeBlockInstruction
{
Instruction instruction;
u32 pc;
bool is_branch_instruction : 1;
bool is_direct_branch_instruction : 1;
bool is_unconditional_branch_instruction : 1;
bool is_branch_delay_slot : 1;
bool is_load_instruction : 1;
bool is_store_instruction : 1;
bool is_load_delay_slot : 1;
bool is_last_instruction : 1;
bool has_load_delay : 1;
bool can_trap : 1;
};
struct CodeBlock
{
using HostCodePointer = void (*)();
struct LinkInfo
{
CodeBlock* block;
void* host_pc;
void* host_resolve_pc;
u32 host_pc_size;
};
CodeBlock(const CodeBlockKey key_) : key(key_) {}
CodeBlockKey key;
u32 host_code_size = 0;
HostCodePointer host_code = nullptr;
std::vector<CodeBlockInstruction> instructions;
std::vector<LinkInfo> link_predecessors;
std::vector<LinkInfo> link_successors;
TickCount uncached_fetch_ticks = 0;
u32 icache_line_count = 0;
#ifdef ENABLE_RECOMPILER
std::vector<Recompiler::LoadStoreBackpatchInfo> loadstore_backpatch_info;
#endif
bool contains_loadstore_instructions = false;
bool contains_double_branches = false;
bool invalidated = false;
bool can_link = true;
u32 recompile_frame_number = 0;
u32 recompile_count = 0;
u32 invalidate_frame_number = 0;
u32 GetPC() const { return key.GetPC(); }
u32 GetSizeInBytes() const { return static_cast<u32>(instructions.size()) * sizeof(Instruction); }
u32 GetStartPageIndex() const { return (key.GetPCPhysicalAddress() / HOST_PAGE_SIZE); }
u32 GetEndPageIndex() const { return ((key.GetPCPhysicalAddress() + GetSizeInBytes()) / HOST_PAGE_SIZE); }
bool IsInRAM() const
{
// TODO: Constant
return key.GetPCPhysicalAddress() < 0x200000;
}
};
namespace CodeCache {
enum : u32
{
FAST_MAP_TABLE_COUNT = 0x10000,
FAST_MAP_TABLE_SIZE = 0x10000 / 4, // 16384
FAST_MAP_TABLE_SHIFT = 16,
};
using FastMapTable = CodeBlock::HostCodePointer*;
/// Frees resources, call once at shutdown.
void ProcessShutdown();
/// Initializes resources for the system.
void Initialize();
/// Frees resources used by the system.
void Shutdown();
/// Runs the system.
[[noreturn]] void Execute();
#ifdef ENABLE_RECOMPILER
using DispatcherFunction = void (*)();
using SingleBlockDispatcherFunction = void (*)(const CodeBlock::HostCodePointer);
FastMapTable* GetFastMapPointer();
#endif
#if defined(ENABLE_RECOMPILER)
JitCodeBuffer& GetCodeBuffer();
#endif
/// Flushes the code cache, forcing all blocks to be recompiled.
void Flush();
/// Changes whether the recompiler is enabled.
void Reinitialize();
void Reset();
/// Invalidates all blocks which are in the range of the specified code page.
void InvalidateBlocksWithPageIndex(u32 page_index);
/// Invalidates all blocks in the cache.
void InvalidateAll();
template<PGXPMode pgxp_mode>
void InterpretCachedBlock(const CodeBlock& block);
template<PGXPMode pgxp_mode>
void InterpretUncachedBlock();
void InvalidateAllRAMBlocks();
/// Invalidates any code pages which overlap the specified range.
ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count)
@ -168,6 +50,4 @@ ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_c
}
}
}; // namespace CodeCache
} // namespace CPU
} // namespace CPU::CodeCache

View file

@ -0,0 +1,279 @@
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "bus.h"
#include "common/bitfield.h"
#include "common/perf_scope.h"
#include "cpu_code_cache.h"
#include "cpu_core_private.h"
#include "cpu_types.h"
#include "util/jit_code_buffer.h"
#include "util/page_fault_handler.h"
#include <array>
#include <map>
#include <memory>
#include <unordered_map>
#include <vector>
#ifdef ENABLE_RECOMPILER
// #include "cpu_recompiler_types.h"
#endif
namespace CPU::CodeCache {
enum : u32
{
LUT_TABLE_COUNT = 0x10000,
LUT_TABLE_SIZE = 0x10000 / sizeof(u32), // 16384, one for each PC
LUT_TABLE_SHIFT = 16,
MAX_BLOCK_EXIT_LINKS = 2,
};
using CodeLUT = const void**;
using CodeLUTArray = std::array<CodeLUT, LUT_TABLE_COUNT>;
using BlockLinkMap = std::unordered_multimap<u32, void*>; // TODO: try ordered?
enum RegInfoFlags : u8
{
RI_LIVE = (1 << 0),
RI_USED = (1 << 1),
RI_LASTUSE = (1 << 2),
};
struct InstructionInfo
{
u32 pc; // TODO: Remove this, old recs still depend on it.
bool is_branch_instruction : 1;
bool is_direct_branch_instruction : 1;
bool is_unconditional_branch_instruction : 1;
bool is_branch_delay_slot : 1;
bool is_load_instruction : 1;
bool is_store_instruction : 1;
bool is_load_delay_slot : 1;
bool is_last_instruction : 1;
bool has_load_delay : 1;
bool can_trap : 1;
u8 reg_flags[static_cast<u8>(Reg::count)];
// Reg write_reg[3];
Reg read_reg[3];
// If unset, values which are not live will not be written back to memory.
// Tends to break stuff at the moment.
static constexpr bool WRITE_DEAD_VALUES = true;
/// Returns true if the register is used later in the block, and this isn't the last instruction to use it.
/// In other words, the register is worth keeping in a host register/caching it.
inline bool UsedTest(Reg reg) const { return (reg_flags[static_cast<u8>(reg)] & (RI_USED | RI_LASTUSE)) == RI_USED; }
/// Returns true if the value should be computed/written back.
/// Basically, this means it's either used before it's overwritten, or not overwritten by the end of the block.
inline bool LiveTest(Reg reg) const
{
return WRITE_DEAD_VALUES || ((reg_flags[static_cast<u8>(reg)] & RI_LIVE) != 0);
}
/// Returns true if the register can be renamed into another.
inline bool RenameTest(Reg reg) const { return (reg == Reg::zero || !UsedTest(reg) || !LiveTest(reg)); }
/// Returns true if this instruction reads this register.
inline bool ReadsReg(Reg reg) const { return (read_reg[0] == reg || read_reg[1] == reg || read_reg[2] == reg); }
};
enum class BlockState : u8
{
Valid,
Invalidated,
NeedsRecompile,
FallbackToInterpreter
};
enum class BlockFlags : u8
{
None = 0,
ContainsLoadStoreInstructions = (1 << 0),
SpansPages = (1 << 1),
BranchDelaySpansPages = (1 << 2),
};
IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(BlockFlags);
enum class PageProtectionMode : u8
{
WriteProtected,
ManualCheck,
Unprotected,
};
struct BlockMetadata
{
TickCount uncached_fetch_ticks;
u32 icache_line_count;
BlockFlags flags;
};
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4324) // C4324: 'CPU::CodeCache::Block': structure was padded due to alignment specifier)
#endif
struct alignas(16) Block
{
u32 pc;
u32 size; // in guest instructions
const void* host_code;
// links to previous/next block within page
Block* next_block_in_page;
BlockLinkMap::iterator exit_links[MAX_BLOCK_EXIT_LINKS];
u8 num_exit_links;
// TODO: Move up so it's part of the same cache line
BlockState state;
BlockFlags flags;
PageProtectionMode protection;
TickCount uncached_fetch_ticks;
u32 icache_line_count;
u32 compile_frame;
u8 compile_count;
// followed by Instruction * size, InstructionRegInfo * size
ALWAYS_INLINE const Instruction* Instructions() const { return reinterpret_cast<const Instruction*>(this + 1); }
ALWAYS_INLINE Instruction* Instructions() { return reinterpret_cast<Instruction*>(this + 1); }
ALWAYS_INLINE const InstructionInfo* InstructionsInfo() const
{
return reinterpret_cast<const InstructionInfo*>(Instructions() + size);
}
ALWAYS_INLINE InstructionInfo* InstructionsInfo()
{
return reinterpret_cast<InstructionInfo*>(Instructions() + size);
}
// returns true if the block has a given flag
ALWAYS_INLINE bool HasFlag(BlockFlags flag) const { return ((flags & flag) != BlockFlags::None); }
// returns the page index for the start of the block
ALWAYS_INLINE u32 StartPageIndex() const { return Bus::GetRAMCodePageIndex(pc); }
// returns the page index for the last instruction in the block (inclusive)
ALWAYS_INLINE u32 EndPageIndex() const { return Bus::GetRAMCodePageIndex(pc + ((size - 1) * sizeof(Instruction))); }
// returns true if the block spans multiple pages
ALWAYS_INLINE bool SpansPages() const { return StartPageIndex() != EndPageIndex(); }
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
using BlockLUTArray = std::array<Block**, LUT_TABLE_COUNT>;
struct LoadstoreBackpatchInfo
{
union
{
struct
{
u32 gpr_bitmask;
u16 cycles;
u16 address_register : 5;
u16 data_register : 5;
u16 size : 2;
u16 is_signed : 1;
u16 is_load : 1;
};
const void* thunk_address; // only needed for oldrec
};
u32 guest_pc;
u8 code_size;
MemoryAccessSize AccessSize() const { return static_cast<MemoryAccessSize>(size); }
u32 AccessSizeInBytes() const { return 1u << size; }
};
static_assert(sizeof(LoadstoreBackpatchInfo) == 16);
static inline bool AddressInRAM(VirtualMemoryAddress pc)
{
return VirtualAddressToPhysical(pc) < Bus::g_ram_size;
}
struct PageProtectionInfo
{
Block* first_block_in_page;
Block* last_block_in_page;
PageProtectionMode mode;
u16 invalidate_count;
u32 invalidate_frame;
};
static_assert(sizeof(PageProtectionInfo) == (sizeof(Block*) * 2 + 8));
template<PGXPMode pgxp_mode>
void InterpretCachedBlock(const Block* block);
template<PGXPMode pgxp_mode>
void InterpretUncachedBlock();
void LogCurrentState();
#if defined(ENABLE_RECOMPILER)
#define ENABLE_RECOMPILER_SUPPORT 1
#if defined(_DEBUG) || false
// Enable disassembly of host assembly code.
#define ENABLE_HOST_DISASSEMBLY 1
#endif
#if false
// Enable profiling of JIT blocks.
#define ENABLE_RECOMPILER_PROFILING 1
#endif
JitCodeBuffer& GetCodeBuffer();
const void* GetInterpretUncachedBlockFunction();
void CompileOrRevalidateBlock(u32 start_pc);
void DiscardAndRecompileBlock(u32 start_pc);
const void* CreateBlockLink(Block* from_block, void* code, u32 newpc);
void AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, const void* thunk_address);
void AddLoadStoreInfo(void* code_address, u32 code_size, u32 guest_pc, TickCount cycles, u32 gpr_bitmask,
u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, bool is_load);
u32 EmitASMFunctions(void* code, u32 code_size);
u32 EmitJump(void* code, const void* dst, bool flush_icache);
void DisassembleAndLogHostCode(const void* start, u32 size);
u32 GetHostInstructionCount(const void* start, u32 size);
extern CodeLUTArray g_code_lut;
extern NORETURN_FUNCTION_POINTER void (*g_enter_recompiler)();
extern const void* g_compile_or_revalidate_block;
extern const void* g_check_events_and_dispatch;
extern const void* g_run_events_and_dispatch;
extern const void* g_dispatcher;
extern const void* g_block_dispatcher;
extern const void* g_interpret_block;
extern const void* g_discard_and_recompile_block;
#ifdef ENABLE_RECOMPILER_PROFILING
extern PerfScope MIPSPerfScope;
#endif // ENABLE_RECOMPILER_PROFILING
#endif // ENABLE_RECOMPILER
} // namespace CPU::CodeCache

View file

@ -7,6 +7,7 @@
#include "common/fastjmp.h"
#include "common/file_system.h"
#include "common/log.h"
#include "cpu_code_cache_private.h"
#include "cpu_core_private.h"
#include "cpu_disasm.h"
#include "cpu_recompiler_thunks.h"
@ -2262,20 +2263,24 @@ void CPU::SingleStep()
}
template<PGXPMode pgxp_mode>
void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block)
void CPU::CodeCache::InterpretCachedBlock(const Block* block)
{
// set up the state so we've already fetched the instruction
DebugAssert(g_state.pc == block.GetPC());
g_state.npc = block.GetPC() + 4;
DebugAssert(g_state.pc == block->pc);
g_state.npc = block->pc + 4;
for (const CodeBlockInstruction& cbi : block.instructions)
const Instruction* instruction = block->Instructions();
const Instruction* end_instruction = instruction + block->size;
const CodeCache::InstructionInfo* info = block->InstructionsInfo();
do
{
g_state.pending_ticks++;
// now executing the instruction we previously fetched
g_state.current_instruction.bits = cbi.instruction.bits;
g_state.current_instruction_pc = cbi.pc;
g_state.current_instruction_in_branch_delay_slot = cbi.is_branch_delay_slot;
g_state.current_instruction.bits = instruction->bits;
g_state.current_instruction_pc = info->pc;
g_state.current_instruction_in_branch_delay_slot = info->is_branch_delay_slot; // TODO: let int set it instead
g_state.current_instruction_was_branch_taken = g_state.branch_was_taken;
g_state.branch_was_taken = false;
g_state.exception_raised = false;
@ -2292,15 +2297,18 @@ void CPU::CodeCache::InterpretCachedBlock(const CodeBlock& block)
if (g_state.exception_raised)
break;
}
instruction++;
info++;
} while (instruction != end_instruction);
// cleanup so the interpreter can kick in if needed
g_state.next_instruction_is_branch_delay_slot = false;
}
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::Disabled>(const CodeBlock& block);
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::Memory>(const CodeBlock& block);
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::CPU>(const CodeBlock& block);
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::Disabled>(const Block* block);
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::Memory>(const Block* block);
template void CPU::CodeCache::InterpretCachedBlock<PGXPMode::CPU>(const Block* block);
template<PGXPMode pgxp_mode>
void CPU::CodeCache::InterpretUncachedBlock()
@ -2989,6 +2997,8 @@ static void MemoryBreakpoint(MemoryAccessType type, MemoryAccessSize size, Virtu
static constexpr const char* types[2] = { "read", "write" };
const u32 cycle = TimingEvents::GetGlobalTickCounter() + CPU::g_state.pending_ticks;
if (cycle == 3301006373)
__debugbreak();
#if 0
static std::FILE* fp = nullptr;

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@
#include "util/jit_code_buffer.h"
#include "cpu_code_cache.h"
#include "cpu_code_cache_private.h"
#include "cpu_recompiler_register_cache.h"
#include "cpu_recompiler_thunks.h"
#include "cpu_recompiler_types.h"
@ -17,34 +17,56 @@
namespace CPU::Recompiler {
enum class Condition : u8
{
Always,
NotEqual,
Equal,
Overflow,
Greater,
GreaterEqual,
LessEqual,
Less,
Negative,
PositiveOrZero,
Above, // unsigned variant of Greater
AboveEqual, // unsigned variant of GreaterEqual
Below, // unsigned variant of Less
BelowEqual, // unsigned variant of LessEqual
NotZero,
Zero
};
class CodeGenerator
{
public:
using SpeculativeValue = std::optional<u32>;
struct CodeBlockInstruction
{
const Instruction* instruction;
const CodeCache::InstructionInfo* info;
};
CodeGenerator(JitCodeBuffer* code_buffer);
~CodeGenerator();
static const char* GetHostRegName(HostReg reg, RegSize size = HostPointerSize);
static void AlignCodeBuffer(JitCodeBuffer* code_buffer);
static bool BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi);
static void BackpatchBranch(void* pc, u32 pc_size, void* target);
static void BackpatchReturn(void* pc, u32 pc_size);
static void BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi);
bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size);
CodeCache::DispatcherFunction CompileDispatcher();
CodeCache::SingleBlockDispatcherFunction CompileSingleBlockDispatcher();
const void* CompileBlock(CodeCache::Block* block, u32* out_host_code_size, u32* out_host_far_code_size);
//////////////////////////////////////////////////////////////////////////
// Code Generation
//////////////////////////////////////////////////////////////////////////
void EmitBeginBlock(bool allocate_registers = true);
void EmitEndBlock(bool free_registers = true, bool emit_return = true);
void EmitEndBlock(bool free_registers, const void* jump_to);
void EmitExceptionExit();
void EmitExceptionExitOnBool(const Value& value);
void FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size);
const void* FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size);
void EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size);
void EmitZeroExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size);
@ -77,6 +99,7 @@ public:
void EmitMoveNextInterpreterLoadDelay();
void EmitCancelInterpreterLoadDelayForReg(Reg reg);
void EmitICacheCheckAndUpdate();
void EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size);
void EmitStallUntilGTEComplete();
void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset);
void EmitStoreCPUStructField(u32 offset, const Value& value);
@ -88,18 +111,19 @@ public:
// Automatically generates an exception handler.
Value GetFastmemLoadBase();
Value GetFastmemStoreBase();
Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const SpeculativeValue& address_spec,
RegSize size);
Value EmitLoadGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address,
const SpeculativeValue& address_spec, RegSize size);
void EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result);
void EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result);
void EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result,
bool in_far_code);
void EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const SpeculativeValue& address_spec,
RegSize size, const Value& value);
void EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value);
void EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value, bool in_far_code);
void EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address,
RegSize size, Value& result);
void EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address,
RegSize size, Value& result, bool in_far_code);
void EmitStoreGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info, const Value& address,
const SpeculativeValue& address_spec, RegSize size, const Value& value);
void EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value);
void EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value, bool in_far_code);
void EmitUpdateFastmemBase();
// Unconditional branch to pointer. May allocate a scratch register.
@ -179,7 +203,7 @@ public:
Value NotValue(const Value& val);
// Raising exception if condition is true.
void GenerateExceptionExit(const CodeBlockInstruction& cbi, Exception excode,
void GenerateExceptionExit(Instruction instruction, const CodeCache::InstructionInfo& info, Exception excode,
Condition condition = Condition::Always);
private:
@ -194,6 +218,7 @@ private:
void SwitchToFarCode();
void SwitchToNearCode();
void* GetStartNearCodePointer() const;
void* GetCurrentCodePointer() const;
void* GetCurrentNearCodePointer() const;
void* GetCurrentFarCodePointer() const;
@ -204,8 +229,9 @@ private:
// branch target, memory address, etc
void BlockPrologue();
void BlockEpilogue();
void InstructionPrologue(const CodeBlockInstruction& cbi, TickCount cycles, bool force_sync = false);
void InstructionEpilogue(const CodeBlockInstruction& cbi);
void InstructionPrologue(Instruction instruction, const CodeCache::InstructionInfo& info, TickCount cycles,
bool force_sync = false);
void InstructionEpilogue(Instruction instruction, const CodeCache::InstructionInfo& info);
void TruncateBlockAtCurrentInstruction();
void AddPendingCycles(bool commit);
void AddGTETicks(TickCount ticks);
@ -221,32 +247,33 @@ private:
//////////////////////////////////////////////////////////////////////////
// Instruction Code Generators
//////////////////////////////////////////////////////////////////////////
bool CompileInstruction(const CodeBlockInstruction& cbi);
bool Compile_Fallback(const CodeBlockInstruction& cbi);
bool Compile_Nop(const CodeBlockInstruction& cbi);
bool Compile_Bitwise(const CodeBlockInstruction& cbi);
bool Compile_Shift(const CodeBlockInstruction& cbi);
bool Compile_Load(const CodeBlockInstruction& cbi);
bool Compile_Store(const CodeBlockInstruction& cbi);
bool Compile_LoadLeftRight(const CodeBlockInstruction& cbi);
bool Compile_StoreLeftRight(const CodeBlockInstruction& cbi);
bool Compile_MoveHiLo(const CodeBlockInstruction& cbi);
bool Compile_Add(const CodeBlockInstruction& cbi);
bool Compile_Subtract(const CodeBlockInstruction& cbi);
bool Compile_Multiply(const CodeBlockInstruction& cbi);
bool Compile_Divide(const CodeBlockInstruction& cbi);
bool Compile_SignedDivide(const CodeBlockInstruction& cbi);
bool Compile_SetLess(const CodeBlockInstruction& cbi);
bool Compile_Branch(const CodeBlockInstruction& cbi);
bool Compile_lui(const CodeBlockInstruction& cbi);
bool Compile_cop0(const CodeBlockInstruction& cbi);
bool Compile_cop2(const CodeBlockInstruction& cbi);
bool CompileInstruction(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Fallback(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Nop(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Bitwise(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Shift(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Load(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Store(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_LoadLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_StoreLeftRight(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_MoveHiLo(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Add(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Subtract(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Multiply(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Divide(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_SignedDivide(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_SetLess(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_Branch(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_lui(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_cop0(Instruction instruction, const CodeCache::InstructionInfo& info);
bool Compile_cop2(Instruction instruction, const CodeCache::InstructionInfo& info);
JitCodeBuffer* m_code_buffer;
CodeBlock* m_block = nullptr;
const CodeBlockInstruction* m_block_start = nullptr;
const CodeBlockInstruction* m_block_end = nullptr;
const CodeBlockInstruction* m_current_instruction = nullptr;
CodeCache::Block* m_block = nullptr;
CodeBlockInstruction m_block_start = {};
CodeBlockInstruction m_block_end = {};
CodeBlockInstruction m_current_instruction = {};
RegisterCache m_register_cache;
CodeEmitter m_near_emitter;
CodeEmitter m_far_emitter;
@ -267,9 +294,6 @@ private:
bool m_next_load_delay_dirty = false;
bool m_gte_busy_cycles_dirty = false;
bool m_fastmem_load_base_in_register = false;
bool m_fastmem_store_base_in_register = false;
//////////////////////////////////////////////////////////////////////////
// Speculative Constants
//////////////////////////////////////////////////////////////////////////

View file

@ -1,9 +1,11 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
#include "cpu_recompiler_code_generator.h"
@ -12,38 +14,230 @@
#include "timing_event.h"
Log_SetChannel(CPU::Recompiler);
// #include "vixl/aarch32/disasm-aarch32.h"
// #include <iostream>
#ifdef ENABLE_HOST_DISASSEMBLY
#include "vixl/aarch32/disasm-aarch32.h"
#include <iostream>
#endif
namespace a32 = vixl::aarch32;
namespace CPU::Recompiler {
constexpr HostReg RCPUPTR = 4;
constexpr HostReg RRETURN = 0;
constexpr HostReg RARG1 = 0;
constexpr HostReg RARG2 = 1;
constexpr HostReg RARG3 = 2;
constexpr HostReg RARG4 = 3;
constexpr HostReg RSCRATCH = 12;
constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32;
constexpr u32 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers
constexpr u32 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes
constexpr u32 FUNCTION_STACK_SIZE =
FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE;
constexpr u32 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE;
} // namespace CPU::Recompiler
static s32 GetPCDisplacement(const void* current, const void* target)
s32 CPU::Recompiler::armGetPCDisplacement(const void* current, const void* target)
{
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
return static_cast<s32>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)));
}
static bool IsPCDisplacementInImmediateRange(s32 displacement)
bool CPU::Recompiler::armIsPCDisplacementInImmediateRange(s32 displacement)
{
return (displacement >= -33554432 && displacement <= 33554428);
}
void CPU::Recompiler::armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm)
{
if (vixl::IsUintN(16, imm))
{
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
return;
}
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
armAsm->movt(vixl::aarch32::al, rd, imm >> 16);
}
void CPU::Recompiler::armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
const void* addr)
{
armEmitMov(armAsm, reg, static_cast<u32>(reinterpret_cast<uintptr_t>(addr)));
}
void CPU::Recompiler::armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
{
// TODO: pooling
const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
if (!armIsPCDisplacementInImmediateRange(displacement))
{
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
armAsm->bx(RSCRATCH);
}
else
{
a32::Label label(displacement + armAsm->GetCursorOffset());
armAsm->b(&label);
}
}
void CPU::Recompiler::armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
{
// TODO: pooling
const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
if (!armIsPCDisplacementInImmediateRange(displacement))
{
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
armAsm->blx(RSCRATCH);
}
else
{
a32::Label label(displacement + armAsm->GetCursorOffset());
armAsm->bl(&label);
}
}
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
a32::PrintDisassembler dis(std::cout, 0);
dis.SetCodeAddress(reinterpret_cast<uintptr_t>(start));
dis.DisassembleA32Buffer(static_cast<const u32*>(start), size);
#else
Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
return size / a32::kA32InstructionSizeInBytes;
}
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
using namespace vixl::aarch32;
using namespace CPU::Recompiler;
const s32 disp = armGetPCDisplacement(code, dst);
DebugAssert(armIsPCDisplacementInImmediateRange(disp));
// A32 jumps are silly.
{
vixl::aarch32::Assembler emit(static_cast<vixl::byte*>(code), kA32InstructionSizeInBytes, a32::A32);
a32::Label label(disp);
emit.b(&label);
}
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kA32InstructionSizeInBytes);
return kA32InstructionSizeInBytes;
}
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
using namespace vixl::aarch32;
using namespace CPU::Recompiler;
#define PTR(x) a32::MemOperand(RSTATE, (s32)(((u8*)(x)) - ((u8*)&g_state)))
Assembler actual_asm(static_cast<u8*>(code), code_size);
Assembler* armAsm = &actual_asm;
#ifdef VIXL_DEBUG
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif
Label dispatch;
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
{
// reserve some space for saving caller-saved registers
armAsm->sub(sp, sp, FUNCTION_STACK_SIZE);
// Need the CPU state for basically everything :-)
armMoveAddressToReg(armAsm, RSTATE, &g_state);
}
// check events then for frame done
g_check_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
{
Label skip_event_check;
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
armAsm->ldr(RARG2, PTR(&g_state.downcount));
armAsm->cmp(RARG1, RARG2);
armAsm->b(lt, &skip_event_check);
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
armAsm->bind(&skip_event_check);
}
// TODO: align?
g_dispatcher = armAsm->GetCursorAddress<const void*>();
{
armAsm->bind(&dispatch);
// x9 <- s_fast_map[pc >> 16]
armAsm->ldr(RARG1, PTR(&g_state.pc));
armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
armAsm->lsr(RARG2, RARG1, 16);
armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
// blr(x9[pc * 2]) (fast_map[pc >> 2])
armAsm->ldr(RARG1, MemOperand(RARG2, RARG1));
armAsm->blx(RARG1);
}
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RARG1, PTR(&g_state.pc));
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
armAsm->b(&dispatch);
}
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RARG1, PTR(&g_state.pc));
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
armAsm->b(&dispatch);
}
g_interpret_block = armAsm->GetCursorAddress<const void*>();
{
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
armAsm->b(&dispatch);
}
armAsm->FinalizeCode();
#if 0
// TODO: align?
s_trampoline_targets.clear();
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
s_trampoline_used = 0;
#endif
#undef PTR
return static_cast<u32>(armAsm->GetCursorOffset()) /* + TRAMPOLINE_AREA_SIZE*/;
}
// Macros aren't used with old-rec.
#undef RRET
#undef RARG1
#undef RARG2
#undef RARG3
#undef RARG4
#undef RSCRATCH
#undef RMEMBASE
#undef RSTATE
namespace CPU::Recompiler {
constexpr HostReg RCPUPTR = 4;
constexpr HostReg RMEMBASEPTR = 5;
constexpr HostReg RRETURN = 0;
constexpr HostReg RARG1 = 0;
constexpr HostReg RARG2 = 1;
constexpr HostReg RARG3 = 2;
constexpr HostReg RARG4 = 3;
constexpr HostReg RSCRATCH = 12;
static const a32::Register GetHostReg8(HostReg reg)
{
return a32::Register(reg);
@ -82,6 +276,11 @@ static const a32::Register GetCPUPtrReg()
return GetHostReg32(RCPUPTR);
}
static const a32::Register GetFastmemBasePtrReg()
{
return GetHostReg32(RMEMBASEPTR);
}
CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer)
: m_code_buffer(code_buffer), m_register_cache(*this),
m_near_emitter(static_cast<vixl::byte*>(code_buffer->GetFreeCodePointer()), code_buffer->GetFreeCodeSpace(),
@ -136,6 +335,11 @@ void CodeGenerator::SwitchToNearCode()
m_emit = &m_near_emitter;
}
void* CodeGenerator::GetStartNearCodePointer() const
{
return static_cast<u8*>(m_code_buffer->GetFreeCodePointer());
}
void* CodeGenerator::GetCurrentNearCodePointer() const
{
return static_cast<u8*>(m_code_buffer->GetFreeCodePointer()) + m_near_emitter.GetCursorOffset();
@ -168,8 +372,6 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al
void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
{
m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
if (allocate_registers)
{
// Save the link register, since we'll be calling functions.
@ -183,22 +385,31 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
// m_emit->Mov(GetCPUPtrReg(), reinterpret_cast<uintptr_t>(&g_state));
DebugAssert(cpu_reg_allocated);
UNREFERENCED_VARIABLE(cpu_reg_allocated);
// If there's loadstore instructions, preload the fastmem base.
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
{
const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
Assert(fastmem_reg_allocated);
m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
}
}
}
void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */)
void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, const void* jump_to)
{
if (free_registers)
{
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
m_register_cache.FreeHostReg(RMEMBASEPTR);
m_register_cache.FreeHostReg(RCPUPTR);
m_register_cache.FreeHostReg(14);
m_register_cache.PopCalleeSavedRegisters(true);
}
m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
if (emit_return)
m_emit->bx(a32::lr);
if (jump_to)
armEmitJmp(m_emit, jump_to, true);
}
void CodeGenerator::EmitExceptionExit()
@ -212,8 +423,7 @@ void CodeGenerator::EmitExceptionExit()
m_register_cache.PopCalleeSavedRegisters(false);
m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
m_emit->bx(a32::lr);
armEmitJmp(m_emit, CodeCache::g_check_events_and_dispatch, true);
}
void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
@ -236,13 +446,14 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
m_register_cache.PopState();
}
void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size)
const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size)
{
m_near_emitter.FinalizeCode();
m_far_emitter.FinalizeCode();
*out_host_code = reinterpret_cast<CodeBlock::HostCodePointer>(m_code_buffer->GetFreeCodePointer());
const void* code = m_code_buffer->GetFreeCodePointer();
*out_host_code_size = static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated());
*out_host_far_code_size = static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated());
m_code_buffer->CommitCode(static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated()));
m_code_buffer->CommitFarCode(static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated()));
@ -252,11 +463,7 @@ void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32
m_far_emitter = CodeEmitter(static_cast<vixl::byte*>(m_code_buffer->GetFreeFarCodePointer()),
m_code_buffer->GetFreeFarCodeSpace(), a32::A32);
#if 0
a32::PrintDisassembler dis(std::cout, 0);
dis.SetCodeAddress(reinterpret_cast<uintptr_t>(*out_host_code));
dis.DisassembleA32Buffer(reinterpret_cast<u32*>(*out_host_code), *out_host_code_size);
#endif
return code;
}
void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size)
@ -847,8 +1054,6 @@ void CodeGenerator::EmitSetConditionResult(HostReg to_reg, RegSize to_size, Cond
u32 CodeGenerator::PrepareStackForCall()
{
m_fastmem_load_base_in_register = false;
m_fastmem_store_base_in_register = false;
m_register_cache.PushCallerSavedRegisters();
return 0;
}
@ -860,17 +1065,7 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size)
void CodeGenerator::EmitCall(const void* ptr)
{
const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr);
if (!IsPCDisplacementInImmediateRange(displacement))
{
m_emit->Mov(GetHostReg32(RSCRATCH), reinterpret_cast<uintptr_t>(ptr));
m_emit->blx(GetHostReg32(RSCRATCH));
}
else
{
a32::Label label(displacement + m_emit->GetCursorOffset());
m_emit->bl(&label);
}
armEmitCall(m_emit, ptr, false);
}
void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr)
@ -1005,7 +1200,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co
void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position)
{
const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 4));
const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4));
m_emit->str(GetHostReg32(reg), addr);
}
@ -1018,7 +1213,7 @@ void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position)
void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position)
{
const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 4));
const a32::MemOperand addr(a32::sp, FUNCTION_STACK_SIZE - (position * 4));
m_emit->ldr(GetHostReg32(reg), addr);
}
@ -1153,51 +1348,13 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value)
}
}
Value CodeGenerator::GetFastmemLoadBase()
{
Value val = Value::FromHostReg(&m_register_cache, RARG4, RegSize_32);
if (!m_fastmem_load_base_in_register)
{
m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base)));
m_fastmem_load_base_in_register = true;
}
return val;
}
Value CodeGenerator::GetFastmemStoreBase()
{
Value val = Value::FromHostReg(&m_register_cache, RARG3, RegSize_32);
if (!m_fastmem_store_base_in_register)
{
m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base)));
m_emit->add(GetHostReg32(val), GetHostReg32(val), sizeof(u32*) * Bus::FASTMEM_LUT_NUM_PAGES);
m_fastmem_store_base_in_register = true;
}
return val;
}
void CodeGenerator::EmitUpdateFastmemBase()
{
if (m_fastmem_load_base_in_register)
{
Value val = Value::FromHostReg(&m_register_cache, RARG4, RegSize_32);
m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base)));
}
if (m_fastmem_store_base_in_register)
{
Value val = Value::FromHostReg(&m_register_cache, RARG3, RegSize_32);
m_emit->ldr(GetHostReg32(val), a32::MemOperand(GetCPUPtrReg(), offsetof(CPU::State, fastmem_base)));
m_emit->add(GetHostReg32(val), GetHostReg32(val), sizeof(u32*) * Bus::FASTMEM_LUT_NUM_PAGES);
}
m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
}
void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result)
{
Value fastmem_base = GetFastmemLoadBase();
HostReg address_reg;
if (address.IsConstant())
{
@ -1212,7 +1369,7 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
switch (size)
{
@ -1234,18 +1391,9 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
}
}
void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result)
void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result)
{
// fastmem
LoadStoreBackpatchInfo bpi;
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = result.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
Value fastmem_base = GetFastmemLoadBase();
HostReg address_reg;
if (address.IsConstant())
{
@ -1258,25 +1406,25 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
m_register_cache.InhibitAllocation();
bpi.host_pc = GetCurrentNearCodePointer();
void* host_pc = GetCurrentNearCodePointer();
switch (size)
{
case RegSize_8:
m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->ldrb(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
case RegSize_16:
m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->ldrh(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
case RegSize_32:
m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->ldr(GetHostReg32(result.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
default:
@ -1284,13 +1432,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
break;
}
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const bool old_store_fastmem_base = m_fastmem_store_base_in_register;
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
const void* host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
@ -1298,27 +1444,22 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// restore fastmem base state for the next instruction
if (old_store_fastmem_base)
fastmem_base = GetFastmemStoreBase();
fastmem_base = GetFastmemLoadBase();
// return to the block code
EmitBranch(GetCurrentNearCodePointer(), false);
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
}
void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result, bool in_far_code)
void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result, bool in_far_code)
{
if (g_settings.cpu_recompiler_memory_exceptions)
{
@ -1359,7 +1500,7 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
EmitOr(result.host_reg, result.host_reg,
Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
static_cast<Exception>(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)));
static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
EmitExceptionExit();
@ -1392,16 +1533,9 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
}
}
void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value)
void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value)
{
LoadStoreBackpatchInfo bpi;
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = value.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
Value fastmem_base = GetFastmemStoreBase();
Value actual_value = GetValueInHostRegister(value);
HostReg address_reg;
@ -1418,25 +1552,27 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
// TODO: if this gets backpatched, these instructions are wasted
m_emit->lsr(GetHostReg32(RARG1), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_SHIFT);
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->ldr(GetHostReg32(RARG1),
a32::MemOperand(GetHostReg32(fastmem_base), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
a32::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a32::LSL, 2)); // pointer load
m_register_cache.InhibitAllocation();
bpi.host_pc = GetCurrentNearCodePointer();
void* host_pc = GetCurrentNearCodePointer();
switch (size)
{
case RegSize_8:
m_emit->strb(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->strb(GetHostReg32(actual_value.host_reg),
a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
case RegSize_16:
m_emit->strh(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->strh(GetHostReg32(actual_value.host_reg),
a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
case RegSize_32:
m_emit->str(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(RARG2)));
m_emit->str(GetHostReg32(actual_value.host_reg), a32::MemOperand(GetHostReg32(RARG1), GetHostReg32(address_reg)));
break;
default:
@ -1444,39 +1580,33 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
break;
}
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const bool old_load_fastmem_base = m_fastmem_load_base_in_register;
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
void* host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, actual_value, true);
EmitStoreGuestMemorySlowmem(instruction, info, address, size, actual_value, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// restore fastmem base state for the next instruction
if (old_load_fastmem_base)
fastmem_base = GetFastmemLoadBase();
fastmem_base = GetFastmemStoreBase();
// return to the block code
EmitBranch(GetCurrentNearCodePointer(), false);
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
}
void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value, bool in_far_code)
void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value,
bool in_far_code)
{
Value value_in_hr = GetValueInHostRegister(value);
@ -1520,7 +1650,7 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi,
m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
EmitOr(result.host_reg, result.host_reg,
Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
static_cast<Exception>(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)));
static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
if (!in_far_code)
@ -1552,18 +1682,18 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi,
}
}
bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi)
{
Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc);
Log_DevFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem at {}", host_pc, lbi.guest_pc, lbi.thunk_address);
// turn it into a jump to the slowmem handler
vixl::aarch32::MacroAssembler emit(static_cast<vixl::byte*>(lbi.host_pc), lbi.host_code_size, a32::A32);
vixl::aarch32::MacroAssembler emit(static_cast<vixl::byte*>(host_pc), lbi.code_size, a32::A32);
// check jump distance
const s32 displacement = GetPCDisplacement(lbi.host_pc, lbi.host_slowmem_pc);
if (!IsPCDisplacementInImmediateRange(displacement))
const s32 displacement = armGetPCDisplacement(host_pc, lbi.thunk_address);
if (!armIsPCDisplacementInImmediateRange(displacement))
{
emit.Mov(GetHostReg32(RSCRATCH), reinterpret_cast<uintptr_t>(lbi.host_slowmem_pc));
armMoveAddressToReg(&emit, GetHostReg32(RSCRATCH), lbi.thunk_address);
emit.bx(GetHostReg32(RSCRATCH));
}
else
@ -1572,56 +1702,12 @@ bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
emit.b(&label);
}
const s32 nops = (static_cast<s32>(lbi.host_code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
const s32 nops = (static_cast<s32>(lbi.code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size);
return true;
}
void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size)
{
Log_ProfilePrintf("Backpatching %p to return", pc);
vixl::aarch32::MacroAssembler emit(static_cast<vixl::byte*>(pc), pc_size, a32::A32);
emit.bx(a32::lr);
const s32 nops = (static_cast<s32>(pc_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
}
void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target)
{
Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target);
vixl::aarch32::MacroAssembler emit(static_cast<vixl::byte*>(pc), pc_size, a32::A32);
// check jump distance
const s32 displacement = GetPCDisplacement(pc, target);
if (!IsPCDisplacementInImmediateRange(displacement))
{
emit.Mov(GetHostReg32(RSCRATCH), reinterpret_cast<uintptr_t>(target));
emit.bx(GetHostReg32(RSCRATCH));
}
else
{
a32::Label label(displacement + emit.GetCursorOffset());
emit.b(&label);
}
// shouldn't have any nops
const s32 nops = (static_cast<s32>(pc_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
@ -1751,7 +1837,8 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
{
if (GetSegmentForAddress(m_pc) >= Segment::KSEG1)
{
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_block->uncached_fetch_ticks)));
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(m_block->uncached_fetch_ticks)));
}
else
{
@ -1789,6 +1876,82 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
}
}
void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
// store it first to reduce code size, because we can offset
armMoveAddressToReg(m_emit, GetHostReg32(RARG1), ram_ptr);
armMoveAddressToReg(m_emit, GetHostReg32(RARG2), shadow_ptr);
u32 offset = 0;
a32::Label block_changed;
#if 0
/* TODO: Vectorize
#include <arm_neon.h>
#include <stdint.h>
bool foo(const void* a, const void* b)
{
uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
uint8x16_t v4 = vld1q_u8((const uint8_t*)a + 16);
uint8x16_t r = vceqq_u8(v1, v2);
uint8x16_t r2 = vceqq_u8(v2, v3);
uint8x16_t r3 = vandq_u8(r, r2);
uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
return false;
else
return true;
}
*/
bool first = true;
while (size >= 16)
{
const a32::VRegister vtmp = a32::v2.V4S();
const a32::VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
m_emit->cmeq(dst, dst, vtmp);
if (!first)
m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
else
first = false;
offset += 16;
size -= 16;
}
if (!first)
{
// TODO: make sure this doesn't choke on ffffffff
m_emit->uminv(a32::s0, a32::v0.V4S());
m_emit->fcmp(a32::s0, 0.0);
m_emit->b(&block_changed, a32::eq);
}
#endif
while (size >= 4)
{
m_emit->ldr(GetHostReg32(RARG3), a32::MemOperand(GetHostReg32(RARG1), offset));
m_emit->ldr(GetHostReg32(RARG4), a32::MemOperand(GetHostReg32(RARG2), offset));
m_emit->cmp(GetHostReg32(RARG3), GetHostReg32(RARG4));
m_emit->b(a32::ne, &block_changed);
offset += 4;
size -= 4;
}
DebugAssert(size == 0);
a32::Label block_unchanged;
m_emit->b(&block_unchanged);
m_emit->bind(&block_changed);
armEmitJmp(m_emit, CodeCache::g_discard_and_recompile_block, false);
m_emit->bind(&block_unchanged);
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick));
@ -1809,8 +1972,8 @@ void CodeGenerator::EmitStallUntilGTEComplete()
void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
{
const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), address);
if (IsPCDisplacementInImmediateRange(displacement))
const s32 displacement = armGetPCDisplacement(GetCurrentCodePointer(), address);
if (armIsPCDisplacementInImmediateRange(displacement))
{
a32::Label label(displacement + m_emit->GetCursorOffset());
m_emit->b(&label);
@ -2057,81 +2220,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr)
m_emit->Mov(GetHostReg32(host_reg), reinterpret_cast<uintptr_t>(ptr));
}
CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
{
m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(RCPUPTR, &g_state);
a32::Label event_test;
m_emit->b(&event_test);
// main dispatch loop
a32::Label main_loop;
m_emit->Bind(&main_loop);
// time to lookup the block
// r0 <- pc
m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, pc)));
// r1 <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(2, CodeCache::GetFastMapPointer());
m_emit->lsr(a32::r1, a32::r0, 16);
m_emit->ldr(a32::r1, a32::MemOperand(a32::r2, a32::r1, a32::LSL, 2));
// blr(r1[pc]) (fast_map[pc >> 2])
m_emit->ldr(a32::r0, a32::MemOperand(a32::r1, a32::r0));
m_emit->blx(a32::r0);
// r0 <- pending_ticks
// r1 <- downcount
m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, pending_ticks)));
m_emit->ldr(a32::r1, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, downcount)));
// while downcount < pending_ticks
a32::Label downcount_hit;
m_emit->cmp(a32::r0, a32::r1);
m_emit->b(a32::lt, &main_loop);
// end while
m_emit->Bind(&event_test);
EmitCall(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
m_emit->b(&main_loop);
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
m_emit->bx(a32::lr);
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr);
return reinterpret_cast<CodeCache::DispatcherFunction>(ptr);
}
CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher()
{
m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(RCPUPTR, &g_state);
m_emit->blx(GetHostReg32(RARG1));
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE);
m_emit->bx(a32::lr);
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Single block dispatcher is %u bytes at %p", code_size, ptr);
return reinterpret_cast<CodeCache::SingleBlockDispatcherFunction>(ptr);
}
} // namespace CPU::Recompiler

View file

@ -1,9 +1,10 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
#include "cpu_recompiler_code_generator.h"
@ -12,8 +13,399 @@
#include "timing_event.h"
Log_SetChannel(CPU::Recompiler);
#ifdef ENABLE_HOST_DISASSEMBLY
#include "vixl/aarch64/disasm-aarch64.h"
#endif
namespace a64 = vixl::aarch64;
namespace CPU::Recompiler {
constexpr u64 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers
constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes
constexpr u64 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE;
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
static std::unordered_map<const void*, u32> s_trampoline_targets;
static u8* s_trampoline_start_ptr = nullptr;
static u32 s_trampoline_used = 0;
} // namespace CPU::Recompiler
bool CPU::Recompiler::armIsCallerSavedRegister(u32 id)
{
// same on both linux and windows
return (id <= 18);
}
void CPU::Recompiler::armEmitMov(a64::Assembler* armAsm, const a64::Register& rd, u64 imm)
{
DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
DebugAssert(rd.GetCode() != a64::sp.GetCode());
if (imm == 0)
{
armAsm->mov(rd, a64::Assembler::AppropriateZeroRegFor(rd));
return;
}
// The worst case for size is mov 64-bit immediate to sp:
// * up to 4 instructions to materialise the constant
// * 1 instruction to move to sp
// Immediates on Aarch64 can be produced using an initial value, and zero to
// three move keep operations.
//
// Initial values can be generated with:
// 1. 64-bit move zero (movz).
// 2. 32-bit move inverted (movn).
// 3. 64-bit move inverted.
// 4. 32-bit orr immediate.
// 5. 64-bit orr immediate.
// Move-keep may then be used to modify each of the 16-bit half words.
//
// The code below supports all five initial value generators, and
// applying move-keep operations to move-zero and move-inverted initial
// values.
// Try to move the immediate in one instruction, and if that fails, switch to
// using multiple instructions.
const unsigned reg_size = rd.GetSizeInBits();
if (a64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
{
// Immediate can be represented in a move zero instruction. Movz can't write
// to the stack pointer.
armAsm->movz(rd, imm);
return;
}
else if (a64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
{
// Immediate can be represented in a move negative instruction. Movn can't
// write to the stack pointer.
armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & a64::kWRegMask));
return;
}
else if (a64::Assembler::IsImmLogical(imm, reg_size))
{
// Immediate can be represented in a logical orr instruction.
DebugAssert(!rd.IsZero());
armAsm->orr(rd, a64::Assembler::AppropriateZeroRegFor(rd), imm);
return;
}
// Generic immediate case. Imm will be represented by
// [imm3, imm2, imm1, imm0], where each imm is 16 bits.
// A move-zero or move-inverted is generated for the first non-zero or
// non-0xffff immX, and a move-keep for subsequent non-zero immX.
uint64_t ignored_halfword = 0;
bool invert_move = false;
// If the number of 0xffff halfwords is greater than the number of 0x0000
// halfwords, it's more efficient to use move-inverted.
if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
{
ignored_halfword = 0xffff;
invert_move = true;
}
// Iterate through the halfwords. Use movn/movz for the first non-ignored
// halfword, and movk for subsequent halfwords.
DebugAssert((reg_size % 16) == 0);
bool first_mov_done = false;
for (unsigned i = 0; i < (reg_size / 16); i++)
{
uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
if (imm16 != ignored_halfword)
{
if (!first_mov_done)
{
if (invert_move)
armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
else
armAsm->movz(rd, imm16, 16 * i);
first_mov_done = true;
}
else
{
// Construct a wider constant.
armAsm->movk(rd, imm16, 16 * i);
}
}
}
DebugAssert(first_mov_done);
}
s64 CPU::Recompiler::armGetPCDisplacement(const void* current, const void* target)
{
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}
void CPU::Recompiler::armMoveAddressToReg(a64::Assembler* armAsm, const a64::XRegister& reg, const void* addr)
{
const void* cur = armAsm->GetCursorAddress<const void*>();
const void* current_code_ptr_page =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
const void* ptr_page =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmAddSub(page_offset))
{
armAsm->adrp(reg, page_displacement);
armAsm->add(reg, reg, page_offset);
}
else if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64))
{
armAsm->adrp(reg, page_displacement);
armAsm->orr(reg, reg, page_offset);
}
else
{
armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
}
}
void CPU::Recompiler::armEmitJmp(a64::Assembler* armAsm, const void* ptr, bool force_inline)
{
const void* cur = armAsm->GetCursorAddress<const void*>();
s64 displacement = armGetPCDisplacement(cur, ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && !force_inline)
{
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
{
displacement = armGetPCDisplacement(cur, trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
armAsm->br(RXSCRATCH);
}
else
{
armAsm->b(displacement);
}
}
void CPU::Recompiler::armEmitCall(a64::Assembler* armAsm, const void* ptr, bool force_inline)
{
const void* cur = armAsm->GetCursorAddress<const void*>();
s64 displacement = armGetPCDisplacement(cur, ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && !force_inline)
{
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
{
displacement = armGetPCDisplacement(cur, trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
armAsm->blr(RXSCRATCH);
}
else
{
armAsm->bl(displacement);
}
}
void CPU::Recompiler::armEmitCondBranch(a64::Assembler* armAsm, a64::Condition cond, const void* ptr)
{
const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
// pxAssert(Common::IsAligned(jump_distance, 4));
if (a64::Instruction::IsValidImmPCOffset(a64::CondBranchType, jump_distance >> 2))
{
armAsm->b(jump_distance >> 2, cond);
}
else
{
a64::Label branch_not_taken;
armAsm->b(&branch_not_taken, InvertCondition(cond));
const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
armAsm->b(new_jump_distance >> 2);
armAsm->bind(&branch_not_taken);
}
}
u8* CPU::Recompiler::armGetJumpTrampoline(const void* target)
{
auto it = s_trampoline_targets.find(target);
if (it != s_trampoline_targets.end())
return s_trampoline_start_ptr + it->second;
// align to 16 bytes?
const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
// 4 movs plus a jump
if (TRAMPOLINE_AREA_SIZE - offset < 20)
{
Panic("Ran out of space in constant pool");
return nullptr;
}
u8* start = s_trampoline_start_ptr + offset;
a64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
armMoveAddressToReg(&armAsm, RXSCRATCH, target);
armAsm.br(RXSCRATCH);
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
DebugAssert(size < 20);
s_trampoline_targets.emplace(target, offset);
s_trampoline_used = offset + static_cast<u32>(size);
JitCodeBuffer::FlushInstructionCache(start, size);
return start;
}
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
class MyDisassembler : public a64::Disassembler
{
protected:
void ProcessOutput(const a64::Instruction* instr) override
{
Log_DebugPrintf("0x%016" PRIx64 " %08" PRIx32 "\t\t%s", reinterpret_cast<uint64_t>(instr),
instr->GetInstructionBits(), GetOutput());
}
};
a64::Decoder decoder;
MyDisassembler disas;
decoder.AppendVisitor(&disas);
decoder.Decode(static_cast<const a64::Instruction*>(start),
reinterpret_cast<const a64::Instruction*>(static_cast<const u8*>(start) + size));
#else
Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
return size / a64::kInstructionSize;
}
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
using namespace a64;
using namespace CPU::Recompiler;
const s64 disp = armGetPCDisplacement(code, dst);
DebugAssert(vixl::IsInt26(disp));
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
std::memcpy(code, &new_code, sizeof(new_code));
if (flush_icache)
JitCodeBuffer::FlushInstructionCache(code, kInstructionSize);
return kInstructionSize;
}
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
using namespace vixl::aarch64;
using namespace CPU::Recompiler;
#define PTR(x) a64::MemOperand(RSTATE, (s64)(((u8*)(x)) - ((u8*)&g_state)))
Assembler actual_asm(static_cast<u8*>(code), code_size);
Assembler* armAsm = &actual_asm;
#ifdef VIXL_DEBUG
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif
Label dispatch;
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
{
// reserve some space for saving caller-saved registers
armAsm->sub(sp, sp, CPU::Recompiler::FUNCTION_STACK_SIZE);
// Need the CPU state for basically everything :-)
armMoveAddressToReg(armAsm, RSTATE, &g_state);
// Fastmem setup, oldrec doesn't need it
if (IsUsingFastmem() && g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler)
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
// Fall through to event dispatcher
}
// check events then for frame done
g_check_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
{
Label skip_event_check;
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
armAsm->cmp(RWARG1, RWARG2);
armAsm->b(&skip_event_check, lt);
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
armAsm->bind(&skip_event_check);
}
// TODO: align?
g_dispatcher = armAsm->GetCursorAddress<const void*>();
{
armAsm->bind(&dispatch);
// x9 <- s_fast_map[pc >> 16]
armAsm->ldr(RWARG1, PTR(&g_state.pc));
armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
armAsm->lsr(RWARG2, RWARG1, 16);
armAsm->lsr(RWARG1, RWARG1, 2);
armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
// blr(x9[pc * 2]) (fast_map[pc >> 2])
armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
armAsm->blr(RXARG1);
}
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RWARG1, PTR(&g_state.pc));
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
armAsm->b(&dispatch);
}
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
{
armAsm->ldr(RWARG1, PTR(&g_state.pc));
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
armAsm->b(&dispatch);
}
g_interpret_block = armAsm->GetCursorAddress<const void*>();
{
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
armAsm->b(&dispatch);
}
armAsm->FinalizeCode();
// TODO: align?
s_trampoline_targets.clear();
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
s_trampoline_used = 0;
#undef PTR
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}
namespace CPU::Recompiler {
constexpr HostReg RCPUPTR = 19;
@ -24,18 +416,6 @@ constexpr HostReg RARG2 = 1;
constexpr HostReg RARG3 = 2;
constexpr HostReg RARG4 = 3;
constexpr HostReg RSCRATCH = 8;
constexpr u64 FUNCTION_CALL_SHADOW_SPACE = 32;
constexpr u64 FUNCTION_CALLEE_SAVED_SPACE_RESERVE = 80; // 8 registers
constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 bytes
constexpr u64 FUNCTION_STACK_SIZE =
FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE;
static s64 GetPCDisplacement(const void* current, const void* target)
{
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}
static const a64::WRegister GetHostReg8(HostReg reg)
{
@ -158,6 +538,11 @@ void CodeGenerator::SwitchToNearCode()
m_emit = &m_near_emitter;
}
void* CodeGenerator::GetStartNearCodePointer() const
{
return static_cast<u8*>(m_code_buffer->GetFreeCodePointer());
}
void* CodeGenerator::GetCurrentNearCodePointer() const
{
return static_cast<u8*>(m_code_buffer->GetFreeCodePointer()) + m_near_emitter.GetCursorOffset();
@ -196,8 +581,6 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al
void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
{
m_emit->Sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
if (allocate_registers)
{
// Save the link register, since we'll be calling functions.
@ -213,7 +596,7 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
UNREFERENCED_VARIABLE(cpu_reg_allocated);
// If there's loadstore instructions, preload the fastmem base.
if (m_block->contains_loadstore_instructions)
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
{
const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
Assert(fastmem_reg_allocated);
@ -222,11 +605,11 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
}
}
void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */)
void CodeGenerator::EmitEndBlock(bool free_registers, const void* jump_to)
{
if (free_registers)
{
if (m_block->contains_loadstore_instructions)
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
m_register_cache.FreeHostReg(RMEMBASEPTR);
m_register_cache.FreeHostReg(RCPUPTR);
@ -235,10 +618,8 @@ void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_ret
m_register_cache.PopCalleeSavedRegisters(true);
}
m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
if (emit_return)
m_emit->Ret();
if (jump_to)
armEmitJmp(m_emit, jump_to, true);
}
void CodeGenerator::EmitExceptionExit()
@ -252,8 +633,7 @@ void CodeGenerator::EmitExceptionExit()
m_register_cache.PopCalleeSavedRegisters(false);
m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
m_emit->Ret();
armEmitJmp(m_emit, CodeCache::g_check_events_and_dispatch, true);
}
void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
@ -275,19 +655,22 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
m_register_cache.PopState();
}
void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size)
const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size)
{
m_near_emitter.FinalizeCode();
m_far_emitter.FinalizeCode();
*out_host_code = reinterpret_cast<CodeBlock::HostCodePointer>(m_code_buffer->GetFreeCodePointer());
const void* code = m_code_buffer->GetFreeCodePointer();
*out_host_code_size = static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated());
*out_host_far_code_size = static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated());
m_code_buffer->CommitCode(static_cast<u32>(m_near_emitter.GetSizeOfCodeGenerated()));
m_code_buffer->CommitFarCode(static_cast<u32>(m_far_emitter.GetSizeOfCodeGenerated()));
m_near_emitter.Reset();
m_far_emitter.Reset();
return code;
}
void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size)
@ -1028,7 +1411,7 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size)
void CodeGenerator::EmitCall(const void* ptr)
{
const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr);
const s64 displacement = armGetPCDisplacement(GetCurrentCodePointer(), ptr);
const bool use_blr = !vixl::IsInt26(displacement);
if (use_blr)
{
@ -1173,25 +1556,25 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co
void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position)
{
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8));
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8));
m_emit->str(GetHostReg64(reg), addr);
}
void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position)
{
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - ((position + 1) * 8));
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - ((position + 1) * 8));
m_emit->stp(GetHostReg64(reg2), GetHostReg64(reg), addr);
}
void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position)
{
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8));
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8));
m_emit->ldr(GetHostReg64(reg), addr);
}
void CodeGenerator::EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position)
{
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8));
const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - (position * 8));
m_emit->ldp(GetHostReg64(reg2), GetHostReg64(reg), addr);
}
@ -1399,15 +1782,11 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
}
}
void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result)
void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result)
{
// fastmem
LoadStoreBackpatchInfo bpi;
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = result.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
void* host_pc = GetCurrentNearCodePointer();
HostReg address_reg;
if (address.IsConstant())
@ -1424,7 +1803,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap)
{
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -1451,7 +1830,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->and_(GetHostReg32(RARG2), GetHostReg32(address_reg), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RARG1), a64::LSL, 3));
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -1473,11 +1852,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
}
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
const void* host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
@ -1485,7 +1864,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
@ -1496,11 +1875,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
}
void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result, bool in_far_code)
void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result, bool in_far_code)
{
if (g_settings.cpu_recompiler_memory_exceptions)
{
@ -1540,7 +1919,7 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
EmitOr(result.host_reg, result.host_reg,
Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
static_cast<Exception>(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)));
static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
EmitExceptionExit();
@ -1573,17 +1952,13 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
}
}
void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value)
void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value)
{
Value value_in_hr = GetValueInHostRegister(value);
// fastmem
LoadStoreBackpatchInfo bpi;
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = value.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
void* host_pc = GetCurrentNearCodePointer();
HostReg address_reg;
if (address.IsConstant())
@ -1599,7 +1974,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_register_cache.InhibitAllocation();
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap)
{
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -1627,7 +2002,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->add(GetHostReg64(RARG3), GetFastmemBasePtrReg(), Bus::FASTMEM_LUT_NUM_PAGES * sizeof(u32*));
m_emit->ldr(GetHostReg64(RARG1), a64::MemOperand(GetHostReg64(RARG3), GetHostReg32(RARG1), a64::LSL, 3));
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -1649,17 +2024,17 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
}
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
void* host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, value_in_hr, true);
EmitStoreGuestMemorySlowmem(instruction, info, address, size, value_in_hr, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
@ -1670,11 +2045,12 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_slowmem_pc);
}
void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value, bool in_far_code)
void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value,
bool in_far_code)
{
Value value_in_hr = GetValueInHostRegister(value);
@ -1717,7 +2093,7 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi,
m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
EmitOr(result.host_reg, result.host_reg,
Value::FromConstantU32(Cop0Registers::CAUSE::MakeValueForException(
static_cast<Exception>(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)));
static_cast<Exception>(0), info.is_branch_delay_slot, false, instruction.cop.cop_n)));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
if (!in_far_code)
@ -1754,64 +2130,26 @@ void CodeGenerator::EmitUpdateFastmemBase()
m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
}
bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi)
{
Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc);
Log_DevFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem at {}", host_pc, lbi.guest_pc, lbi.thunk_address);
// check jump distance
const s64 jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(lbi.host_slowmem_pc) - reinterpret_cast<intptr_t>(lbi.host_pc));
static_cast<s64>(reinterpret_cast<intptr_t>(lbi.thunk_address) - reinterpret_cast<intptr_t>(host_pc));
Assert(Common::IsAligned(jump_distance, 4));
Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2));
// turn it into a jump to the slowmem handler
vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(lbi.host_pc), lbi.host_code_size,
a64::PositionDependentCode);
vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(host_pc), lbi.code_size, a64::PositionDependentCode);
emit.b(jump_distance >> 2);
const s32 nops = (static_cast<s32>(lbi.host_code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
const s32 nops = (static_cast<s32>(lbi.code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size);
return true;
}
void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size)
{
Log_ProfilePrintf("Backpatching %p to return", pc);
vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(pc), pc_size, a64::PositionDependentCode);
emit.ret();
const s32 nops = (static_cast<s32>(pc_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
}
void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target)
{
Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target);
// check jump distance
const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(pc));
Assert(Common::IsAligned(jump_distance, 4));
Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2));
vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(pc), pc_size, a64::PositionDependentCode);
emit.b(jump_distance >> 2);
// shouldn't have any nops
const s32 nops = (static_cast<s32>(pc_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
emit.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
@ -1980,6 +2318,69 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
}
}
void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
// store it first to reduce code size, because we can offset
armMoveAddressToReg(m_emit, RXARG1, ram_ptr);
armMoveAddressToReg(m_emit, RXARG2, shadow_ptr);
bool first = true;
u32 offset = 0;
a64::Label block_changed;
while (size >= 16)
{
const a64::VRegister vtmp = a64::v2.V4S();
const a64::VRegister dst = first ? a64::v0.V4S() : a64::v1.V4S();
m_emit->ldr(dst, a64::MemOperand(RXARG1, offset));
m_emit->ldr(vtmp, a64::MemOperand(RXARG2, offset));
m_emit->cmeq(dst, dst, vtmp);
if (!first)
m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
else
first = false;
offset += 16;
size -= 16;
}
if (!first)
{
// TODO: make sure this doesn't choke on ffffffff
m_emit->uminv(a64::s0, a64::v0.V4S());
m_emit->fcmp(a64::s0, 0.0);
m_emit->b(&block_changed, a64::eq);
}
while (size >= 8)
{
m_emit->ldr(RXARG3, a64::MemOperand(RXARG1, offset));
m_emit->ldr(RXSCRATCH, a64::MemOperand(RXARG2, offset));
m_emit->cmp(RXARG3, RXSCRATCH);
m_emit->b(&block_changed, a64::ne);
offset += 8;
size -= 8;
}
while (size >= 4)
{
m_emit->ldr(RWARG3, a64::MemOperand(RXARG1, offset));
m_emit->ldr(RWSCRATCH, a64::MemOperand(RXARG2, offset));
m_emit->cmp(RWARG3, RWSCRATCH);
m_emit->b(&block_changed, a64::ne);
offset += 4;
size -= 4;
}
DebugAssert(size == 0);
a64::Label block_unchanged;
m_emit->b(&block_unchanged);
m_emit->bind(&block_changed);
armEmitJmp(m_emit, CodeCache::g_discard_and_recompile_block, false);
m_emit->bind(&block_unchanged);
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick));
@ -2253,7 +2654,7 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr)
reinterpret_cast<uintptr_t>(GetCurrentCodePointer()) & ~static_cast<uintptr_t>(0xFFF));
const void* ptr_page =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(ptr) & ~static_cast<uintptr_t>(0xFFF));
const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(ptr) & 0xFFFu);
if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64))
{
@ -2266,81 +2667,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr)
}
}
CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
{
m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(RCPUPTR, &g_state);
a64::Label event_test;
m_emit->b(&event_test);
// main dispatch loop
a64::Label main_loop;
m_emit->Bind(&main_loop);
// time to lookup the block
// w8 <- pc
m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pc)));
// x9 <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(10, CodeCache::GetFastMapPointer());
m_emit->lsr(a64::w9, a64::w8, 16);
m_emit->lsr(a64::w8, a64::w8, 2);
m_emit->ldr(a64::x9, a64::MemOperand(a64::x10, a64::x9, a64::LSL, 3));
// blr(x9[pc * 2]) (fast_map[pc >> 2])
m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3));
m_emit->blr(a64::x8);
// w8 <- pending_ticks
// w9 <- downcount
m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks)));
m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount)));
// while downcount < pending_ticks
m_emit->cmp(a64::w8, a64::w9);
m_emit->b(&main_loop, a64::lt);
m_emit->Bind(&event_test);
EmitCall(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
m_emit->b(&main_loop);
// all done
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
m_emit->ret();
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr);
return reinterpret_cast<CodeCache::DispatcherFunction>(ptr);
}
CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher()
{
m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(RCPUPTR, &g_state);
m_emit->blr(GetHostReg64(RARG1));
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE);
m_emit->ret();
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr);
return reinterpret_cast<CodeCache::SingleBlockDispatcherFunction>(ptr);
}
} // namespace CPU::Recompiler

View file

@ -29,8 +29,8 @@ void CodeGenerator::EmitStoreInterpreterLoadDelay(Reg reg, const Value& value)
m_load_delay_dirty = true;
}
Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address,
const SpeculativeValue& address_spec, RegSize size)
Value CodeGenerator::EmitLoadGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, const SpeculativeValue& address_spec, RegSize size)
{
if (address.IsConstant() && !SpeculativeIsCacheIsolated())
{
@ -44,7 +44,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
{
Value result = m_register_cache.AllocateScratch(size);
if (g_settings.IsUsingFastmem() && Bus::IsRAMAddress(static_cast<u32>(address.constant_value)))
// TODO: mask off...
if (CodeCache::IsUsingFastmem() && Bus::IsRAMAddress(static_cast<u32>(address.constant_value)))
{
// have to mask away the high bits for mirrors, since we don't map them in fastmem
EmitLoadGuestRAMFastmem(Value::FromConstantU32(static_cast<u32>(address.constant_value) & Bus::g_ram_mask),
@ -68,25 +69,25 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
{
if (!use_fastmem)
{
Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address 0x%08X, using fastmem = %s", cbi.pc,
Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address 0x%08X, using fastmem = %s", info.pc,
*address_spec, use_fastmem ? "yes" : "no");
}
}
else
{
Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address UNKNOWN, using fastmem = %s", cbi.pc,
Log_ProfilePrintf("Non-constant load at 0x%08X, speculative address UNKNOWN, using fastmem = %s", info.pc,
use_fastmem ? "yes" : "no");
}
if (g_settings.IsUsingFastmem() && use_fastmem)
if (CodeCache::IsUsingFastmem() && use_fastmem)
{
EmitLoadGuestMemoryFastmem(cbi, address, size, result);
EmitLoadGuestMemoryFastmem(instruction, info, address, size, result);
}
else
{
AddPendingCycles(true);
m_register_cache.FlushCallerSavedGuestRegisters(true, true);
EmitLoadGuestMemorySlowmem(cbi, address, size, result, false);
EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, false);
}
// Downcast to ignore upper 56/48/32 bits. This should be a noop.
@ -115,8 +116,9 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
return result;
}
void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address,
const SpeculativeValue& address_spec, RegSize size, const Value& value)
void CodeGenerator::EmitStoreGuestMemory(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, const SpeculativeValue& address_spec, RegSize size,
const Value& value)
{
if (address.IsConstant() && !SpeculativeIsCacheIsolated())
{
@ -141,25 +143,25 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
{
if (!use_fastmem)
{
Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address 0x%08X, using fastmem = %s", cbi.pc,
Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address 0x%08X, using fastmem = %s", info.pc,
*address_spec, use_fastmem ? "yes" : "no");
}
}
else
{
Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address UNKNOWN, using fastmem = %s", cbi.pc,
Log_ProfilePrintf("Non-constant store at 0x%08X, speculative address UNKNOWN, using fastmem = %s", info.pc,
use_fastmem ? "yes" : "no");
}
if (g_settings.IsUsingFastmem() && use_fastmem)
if (CodeCache::IsUsingFastmem() && use_fastmem)
{
EmitStoreGuestMemoryFastmem(cbi, address, size, value);
EmitStoreGuestMemoryFastmem(instruction, info, address, size, value);
}
else
{
AddPendingCycles(true);
m_register_cache.FlushCallerSavedGuestRegisters(true, true);
EmitStoreGuestMemorySlowmem(cbi, address, size, value, false);
EmitStoreGuestMemorySlowmem(instruction, info, address, size, value, false);
}
}

View file

@ -1,38 +1,315 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "cpu_code_cache_private.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
#include "cpu_recompiler_code_generator.h"
#include "cpu_recompiler_thunks.h"
#include "settings.h"
#include "timing_event.h"
#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
Log_SetChannel(Recompiler::CodeGenerator);
#ifdef ENABLE_HOST_DISASSEMBLY
#include "Zycore/Format.h"
#include "Zycore/Status.h"
#include "Zydis/Zydis.h"
#endif
bool CPU::Recompiler::IsCallerSavedRegister(u32 id)
{
#ifdef _WIN32
// The x64 ABI considers the registers RAX, RCX, RDX, R8, R9, R10, R11, and XMM0-XMM5 volatile.
return (id <= 2 || (id >= 8 && id <= 11));
#else
// rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11 are scratch registers.
return (id <= 2 || id == 6 || id == 7 || (id >= 8 && id <= 11));
#endif
}
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
using namespace Xbyak;
#define PTR(x) (cg->rbp + (((u8*)(x)) - ((u8*)&g_state)))
#ifdef _WIN32
// Shadow space for Win32
constexpr u32 stack_size = 32 + 8;
#else
// Stack still needs to be aligned
constexpr u32 stack_size = 8;
#endif
DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler);
CodeGenerator acg(code_size, static_cast<u8*>(code));
CodeGenerator* cg = &acg;
Label dispatch;
Label exit_recompiler;
g_enter_recompiler = reinterpret_cast<decltype(g_enter_recompiler)>(const_cast<u8*>(cg->getCurr()));
{
// Don't need to save registers, because we fastjmp out when execution is interrupted.
cg->sub(cg->rsp, stack_size);
// CPU state pointer
cg->lea(cg->rbp, cg->qword[cg->rip + &g_state]);
// newrec preloads fastmem base
if (g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler && CodeCache::IsUsingFastmem())
cg->mov(cg->rbx, cg->qword[PTR(&g_state.fastmem_base)]);
// Fall through to event dispatcher
}
// check events then for frame done
g_check_events_and_dispatch = cg->getCurr();
{
Label skip_event_check;
cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
cg->jl(skip_event_check);
g_run_events_and_dispatch = cg->getCurr();
cg->call(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
cg->L(skip_event_check);
}
// TODO: align?
g_dispatcher = cg->getCurr();
{
cg->L(dispatch);
// rcx <- s_fast_map[pc >> 16]
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
cg->lea(RXARG2, cg->dword[PTR(g_code_lut.data())]);
cg->mov(RWARG3, RWARG1);
cg->shr(RWARG3, 16);
cg->mov(RXARG2, cg->qword[RXARG2 + RXARG3 * 8]);
// call(rcx[pc * 2]) (fast_map[pc >> 2])
cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
}
g_compile_or_revalidate_block = cg->getCurr();
{
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
cg->call(&CompileOrRevalidateBlock);
cg->jmp(dispatch);
}
g_discard_and_recompile_block = cg->getCurr();
{
cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
cg->call(&DiscardAndRecompileBlock);
cg->jmp(dispatch);
}
g_interpret_block = cg->getCurr();
{
cg->call(CodeCache::GetInterpretUncachedBlockFunction());
cg->jmp(dispatch);
}
#undef PTR
return static_cast<u32>(cg->getSize());
}
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
u8* ptr = static_cast<u8*>(code);
*(ptr++) = 0xE9; // jmp
const ptrdiff_t disp = (reinterpret_cast<uintptr_t>(dst) - reinterpret_cast<uintptr_t>(code)) - 5;
DebugAssert(disp >= static_cast<ptrdiff_t>(std::numeric_limits<s32>::min()) &&
disp <= static_cast<ptrdiff_t>(std::numeric_limits<s32>::max()));
const s32 disp32 = static_cast<s32>(disp);
std::memcpy(ptr, &disp32, sizeof(disp32));
return 5;
}
#ifdef ENABLE_HOST_DISASSEMBLY
static ZydisFormatterFunc s_old_print_address;
static ZyanStatus ZydisFormatterPrintAddressAbsolute(const ZydisFormatter* formatter, ZydisFormatterBuffer* buffer,
ZydisFormatterContext* context)
{
using namespace CPU;
ZyanU64 address;
ZYAN_CHECK(ZydisCalcAbsoluteAddress(context->instruction, context->operand, context->runtime_address, &address));
char buf[128];
u32 len = 0;
#define A(x) static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(x))
if (address >= A(Bus::g_ram) && address < A(Bus::g_ram + Bus::g_ram_size))
{
len = snprintf(buf, sizeof(buf), "g_ram+0x%08X", static_cast<u32>(address - A(Bus::g_ram)));
}
else if (address >= A(&g_state.regs) &&
address < A(reinterpret_cast<const u8*>(&g_state.regs) + sizeof(CPU::Registers)))
{
len = snprintf(buf, sizeof(buf), "g_state.regs.%s",
GetRegName(static_cast<CPU::Reg>(((address - A(&g_state.regs.r[0])) / 4u))));
}
else if (address >= A(&g_state.cop0_regs) &&
address < A(reinterpret_cast<const u8*>(&g_state.cop0_regs) + sizeof(CPU::Cop0Registers)))
{
for (const DebuggerRegisterListEntry& rle : g_debugger_register_list)
{
if (address == static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(rle.value_ptr)))
{
len = snprintf(buf, sizeof(buf), "g_state.cop0_regs.%s", rle.name);
break;
}
}
}
else if (address >= A(&g_state.gte_regs) &&
address < A(reinterpret_cast<const u8*>(&g_state.gte_regs) + sizeof(GTE::Regs)))
{
for (const DebuggerRegisterListEntry& rle : g_debugger_register_list)
{
if (address == static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(rle.value_ptr)))
{
len = snprintf(buf, sizeof(buf), "g_state.gte_regs.%s", rle.name);
break;
}
}
}
else if (address == A(&g_state.load_delay_reg))
{
len = snprintf(buf, sizeof(buf), "g_state.load_delay_reg");
}
else if (address == A(&g_state.next_load_delay_reg))
{
len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_reg");
}
else if (address == A(&g_state.load_delay_value))
{
len = snprintf(buf, sizeof(buf), "g_state.load_delay_value");
}
else if (address == A(&g_state.next_load_delay_value))
{
len = snprintf(buf, sizeof(buf), "g_state.next_load_delay_value");
}
else if (address == A(&g_state.pending_ticks))
{
len = snprintf(buf, sizeof(buf), "g_state.pending_ticks");
}
else if (address == A(&g_state.downcount))
{
len = snprintf(buf, sizeof(buf), "g_state.downcount");
}
#undef A
if (len > 0)
{
ZYAN_CHECK(ZydisFormatterBufferAppend(buffer, ZYDIS_TOKEN_SYMBOL));
ZyanString* string;
ZYAN_CHECK(ZydisFormatterBufferGetString(buffer, &string));
return ZyanStringAppendFormat(string, "&%s", buf);
}
return s_old_print_address(formatter, buffer, context);
}
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
ZydisDecoder disas_decoder;
ZydisFormatter disas_formatter;
ZydisDecodedInstruction disas_instruction;
ZydisDecodedOperand disas_operands[ZYDIS_MAX_OPERAND_COUNT];
ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
ZydisFormatterInit(&disas_formatter, ZYDIS_FORMATTER_STYLE_INTEL);
s_old_print_address = (ZydisFormatterFunc)&ZydisFormatterPrintAddressAbsolute;
ZydisFormatterSetHook(&disas_formatter, ZYDIS_FORMATTER_FUNC_PRINT_ADDRESS_ABS, (const void**)&s_old_print_address);
const u8* ptr = static_cast<const u8*>(start);
TinyString hex;
ZyanUSize remaining = size;
while (ZYAN_SUCCESS(ZydisDecoderDecodeFull(&disas_decoder, ptr, remaining, &disas_instruction, disas_operands)))
{
char buffer[256];
if (ZYAN_SUCCESS(ZydisFormatterFormatInstruction(&disas_formatter, &disas_instruction, disas_operands,
ZYDIS_MAX_OPERAND_COUNT, buffer, sizeof(buffer),
static_cast<ZyanU64>(reinterpret_cast<uintptr_t>(ptr)), nullptr)))
{
hex.clear();
for (u32 i = 0; i < 10; i++)
{
if (i < disas_instruction.length)
hex.append_fmt(" {:02X}", ptr[i]);
else
hex.append(" ");
}
Log::WriteFmt("HostCode", "", LOGLEVEL_DEBUG, " {:016X} {} {}",
static_cast<u64>(reinterpret_cast<uintptr_t>(ptr)), hex, buffer);
}
ptr += disas_instruction.length;
remaining -= disas_instruction.length;
}
}
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
ZydisDecoder disas_decoder;
ZydisDecodedInstruction disas_instruction;
ZydisDecoderContext disas_context;
ZydisDecoderInit(&disas_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
const u8* ptr = static_cast<const u8*>(start);
ZyanUSize remaining = size;
u32 inst_count = 0;
while (
ZYAN_SUCCESS(ZydisDecoderDecodeInstruction(&disas_decoder, &disas_context, ptr, remaining, &disas_instruction)))
{
ptr += disas_instruction.length;
remaining -= disas_instruction.length;
inst_count++;
}
return inst_count;
}
#else
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY.");
}
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY.");
return 0;
}
#endif // ENABLE_HOST_DISASSEMBLY
namespace CPU::Recompiler {
#if defined(ABI_WIN64)
constexpr HostReg RCPUPTR = Xbyak::Operand::RBP;
constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX;
constexpr HostReg RRETURN = Xbyak::Operand::RAX;
constexpr HostReg RARG1 = Xbyak::Operand::RCX;
constexpr HostReg RARG2 = Xbyak::Operand::RDX;
constexpr HostReg RARG3 = Xbyak::Operand::R8;
constexpr HostReg RARG4 = Xbyak::Operand::R9;
constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32;
#elif defined(ABI_SYSV)
constexpr HostReg RCPUPTR = Xbyak::Operand::RBP;
constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX;
constexpr HostReg RRETURN = Xbyak::Operand::RAX;
constexpr HostReg RARG1 = Xbyak::Operand::RDI;
constexpr HostReg RARG2 = Xbyak::Operand::RSI;
constexpr HostReg RARG3 = Xbyak::Operand::RDX;
constexpr HostReg RARG4 = Xbyak::Operand::RCX;
constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 0;
#endif
static constexpr HostReg RCPUPTR = Xbyak::Operand::RBP;
static constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX;
static constexpr HostReg RRETURN = RXRET.getIdx();
static constexpr HostReg RARG1 = RXARG1.getIdx();
static constexpr HostReg RARG2 = RXARG2.getIdx();
static constexpr HostReg RARG3 = RXARG3.getIdx();
static constexpr HostReg RARG4 = RXARG4.getIdx();
static const Xbyak::Reg8 GetHostReg8(HostReg reg)
{
@ -80,7 +357,7 @@ static const Xbyak::Reg64 GetHostReg64(const Value& value)
static const Xbyak::Reg64 GetCPUPtrReg()
{
return GetHostReg64(RCPUPTR);
return Xbyak::Reg64(RCPUPTR);
}
static const Xbyak::Reg64 GetFastmemBasePtrReg()
@ -177,6 +454,11 @@ void CodeGenerator::SwitchToNearCode()
m_emit = &m_near_emitter;
}
void* CodeGenerator::GetStartNearCodePointer() const
{
return m_near_emitter.getCode<u8*>();
}
void* CodeGenerator::GetCurrentNearCodePointer() const
{
return m_near_emitter.getCurr<void*>();
@ -217,10 +499,9 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR);
DebugAssert(cpu_reg_allocated);
UNREFERENCED_VARIABLE(cpu_reg_allocated);
// m_emit->mov(GetCPUPtrReg(), reinterpret_cast<size_t>(&g_state));
// If there's loadstore instructions, preload the fastmem base.
if (m_block->contains_loadstore_instructions)
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
{
const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
DebugAssert(fastmem_reg_allocated);
@ -230,19 +511,19 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
}
}
void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */)
void CodeGenerator::EmitEndBlock(bool free_registers, const void* jump_to)
{
if (free_registers)
{
m_register_cache.FreeHostReg(RCPUPTR);
if (m_block->contains_loadstore_instructions)
if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
m_register_cache.FreeHostReg(RMEMBASEPTR);
m_register_cache.PopCalleeSavedRegisters(true);
}
if (emit_return)
m_emit->ret();
if (jump_to)
m_emit->jmp(jump_to);
}
void CodeGenerator::EmitExceptionExit()
@ -257,7 +538,7 @@ void CodeGenerator::EmitExceptionExit()
m_register_cache.FlushLoadDelay(false);
m_register_cache.PopCalleeSavedRegisters(false);
m_emit->ret();
m_emit->jmp(CodeCache::g_check_events_and_dispatch);
}
void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
@ -276,20 +557,23 @@ void CodeGenerator::EmitExceptionExitOnBool(const Value& value)
m_register_cache.PopState();
}
void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size)
const void* CodeGenerator::FinalizeBlock(u32* out_host_code_size, u32* out_host_far_code_size)
{
m_near_emitter.ready();
m_far_emitter.ready();
const u32 near_size = static_cast<u32>(m_near_emitter.getSize());
const u32 far_size = static_cast<u32>(m_far_emitter.getSize());
*out_host_code = m_near_emitter.getCode<CodeBlock::HostCodePointer>();
const void* code = m_near_emitter.getCode<const void*>();
*out_host_code_size = near_size;
*out_host_far_code_size = far_size;
m_code_buffer->CommitCode(near_size);
m_code_buffer->CommitFarCode(far_size);
m_near_emitter.reset();
m_far_emitter.reset();
return code;
}
void CodeGenerator::EmitSignExtend(HostReg to_reg, RegSize to_size, HostReg from_reg, RegSize from_size)
@ -1461,8 +1745,9 @@ u32 CodeGenerator::PrepareStackForCall()
// we assume that the stack is unaligned at this point
const u32 num_callee_saved = m_register_cache.GetActiveCalleeSavedRegisterCount();
const u32 num_caller_saved = m_register_cache.PushCallerSavedRegisters();
const u32 current_offset = 8 + (num_callee_saved + num_caller_saved) * 8;
const u32 aligned_offset = Common::AlignUp(current_offset + FUNCTION_CALL_SHADOW_SPACE, 16);
const u32 current_offset = (num_callee_saved + num_caller_saved) * 8;
const u32 aligned_offset =
(current_offset == 0) ? 0 : Common::AlignUp(current_offset + FUNCTION_CALL_SHADOW_SPACE, 16);
const u32 adjust_size = aligned_offset - current_offset;
if (adjust_size > 0)
m_emit->sub(m_emit->rsp, adjust_size);
@ -1902,16 +2187,11 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
}
}
void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result)
void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result)
{
// fastmem
LoadStoreBackpatchInfo bpi;
bpi.host_pc = GetCurrentNearCodePointer();
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = result.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
void* host_pc = GetCurrentNearCodePointer();
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap)
{
@ -1921,7 +2201,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
{
actual_address = &result;
m_emit->mov(GetHostReg32(result.host_reg), address.constant_value);
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
}
m_register_cache.InhibitAllocation();
@ -1988,7 +2268,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->shr(GetHostReg32(RARG1), Bus::FASTMEM_LUT_PAGE_SHIFT);
m_emit->and_(GetHostReg32(RARG2), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->mov(GetHostReg64(RARG1), m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8]);
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -2011,18 +2291,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
// insert nops, we need at least 5 bytes for a relative jump
const u32 fastmem_size =
static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
const u32 fastmem_size = static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc));
const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
for (u32 i = 0; i < nops; i++)
m_emit->nop();
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
m_far_emitter.align(16);
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
void* thunk_host_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
@ -2030,7 +2309,7 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitLoadGuestMemorySlowmem(instruction, info, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
@ -2041,11 +2320,11 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, thunk_host_pc);
}
void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
Value& result, bool in_far_code)
void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, Value& result, bool in_far_code)
{
if (g_settings.cpu_recompiler_memory_exceptions)
{
@ -2082,8 +2361,8 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
m_emit->neg(GetHostReg32(result.host_reg));
m_emit->shl(GetHostReg32(result.host_reg), 2);
m_emit->or_(GetHostReg32(result.host_reg),
Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0), cbi.is_branch_delay_slot, false,
cbi.instruction.cop.cop_n));
Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0), info.is_branch_delay_slot, false,
instruction.cop.cop_n));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
EmitExceptionExit();
@ -2116,16 +2395,11 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi,
}
}
void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value)
void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value)
{
// fastmem
LoadStoreBackpatchInfo bpi;
bpi.host_pc = GetCurrentNearCodePointer();
bpi.address_host_reg = HostReg_Invalid;
bpi.value_host_reg = value.host_reg;
bpi.guest_pc = m_current_instruction->pc;
bpi.fault_count = 0;
void* host_pc = GetCurrentNearCodePointer();
if (g_settings.cpu_fastmem_mode == CPUFastmemMode::MMap)
{
@ -2137,7 +2411,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
temp_address.SetHostReg(&m_register_cache, RRETURN, RegSize_32);
actual_address = &temp_address;
m_emit->mov(GetHostReg32(temp_address), address.constant_value);
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
}
m_register_cache.InhibitAllocation();
@ -2252,7 +2526,7 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_emit->and_(GetHostReg32(RARG2), Bus::FASTMEM_LUT_PAGE_MASK);
m_emit->mov(GetHostReg64(RARG1),
m_emit->qword[GetFastmemBasePtrReg() + GetHostReg64(RARG1) * 8 + (Bus::FASTMEM_LUT_NUM_PAGES * 8)]);
bpi.host_pc = GetCurrentNearCodePointer();
host_pc = GetCurrentNearCodePointer();
switch (size)
{
@ -2290,24 +2564,23 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
}
// insert nops, we need at least 5 bytes for a relative jump
const u32 fastmem_size =
static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
const u32 fastmem_size = static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc));
const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
for (u32 i = 0; i < nops; i++)
m_emit->nop();
bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
const u32 host_code_size =
static_cast<u32>(static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(host_pc)));
// generate slowmem fallback
m_far_emitter.align();
bpi.host_slowmem_pc = GetCurrentFarCodePointer();
const void* host_thunk_pc = GetCurrentFarCodePointer();
SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, value, true);
EmitStoreGuestMemorySlowmem(instruction, info, address, size, value, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
@ -2318,11 +2591,12 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
SwitchToNearCode();
m_register_cache.UninhibitAllocation();
m_block->loadstore_backpatch_info.push_back(bpi);
CPU::CodeCache::AddLoadStoreInfo(host_pc, host_code_size, info.pc, host_thunk_pc);
}
void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
const Value& value, bool in_far_code)
void CodeGenerator::EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
const Value& address, RegSize size, const Value& value,
bool in_far_code)
{
if (g_settings.cpu_recompiler_memory_exceptions)
{
@ -2360,8 +2634,8 @@ void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi,
// cause_bits = (result << 2) | BD | cop_n
m_emit->shl(GetHostReg32(result), 2);
m_emit->or_(GetHostReg32(result),
Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0), cbi.is_branch_delay_slot, false,
cbi.instruction.cop.cop_n));
Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0), info.is_branch_delay_slot, false,
instruction.cop.cop_n));
EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
EmitExceptionExit();
@ -2398,55 +2672,21 @@ void CodeGenerator::EmitUpdateFastmemBase()
m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]);
}
bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
void CodeGenerator::BackpatchLoadStore(void* host_pc, const CodeCache::LoadstoreBackpatchInfo& lbi)
{
Log_ProfilePrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc);
Log_ProfileFmt("Backpatching {} (guest PC 0x{:08X}) to slowmem", host_pc, lbi.guest_pc);
// turn it into a jump to the slowmem handler
Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc);
cg.jmp(lbi.host_slowmem_pc);
Xbyak::CodeGenerator cg(lbi.code_size, host_pc);
cg.jmp(lbi.thunk_address);
const s32 nops = static_cast<s32>(lbi.host_code_size) -
static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(lbi.host_pc)));
const s32 nops = static_cast<s32>(lbi.code_size) -
static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(host_pc)));
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
cg.nop();
JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size);
return true;
}
void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size)
{
Log_ProfilePrintf("Backpatching %p to return", pc);
Xbyak::CodeGenerator cg(pc_size, pc);
cg.ret();
const s32 nops =
static_cast<s32>(pc_size) - static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(pc)));
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
cg.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
}
void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target)
{
Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target);
Xbyak::CodeGenerator cg(pc_size, pc);
cg.jmp(target);
// shouldn't have any nops
const s32 nops =
static_cast<s32>(pc_size) - static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(pc)));
Assert(nops >= 0);
for (s32 i = 0; i < nops; i++)
cg.nop();
JitCodeBuffer::FlushInstructionCache(pc, pc_size);
JitCodeBuffer::FlushInstructionCache(host_pc, lbi.code_size);
}
void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
@ -2737,6 +2977,62 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
}
}
void CodeGenerator::EmitBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
const auto ram_ptr_reg = GetHostReg64(RARG1);
const auto shadow_ptr_reg = GetHostReg64(RARG2);
const auto temp_reg = GetHostReg64(RARG3);
const auto temp_reg32 = GetHostReg32(RARG3);
// store it first to reduce code size, because we can offset
m_emit->mov(ram_ptr_reg, static_cast<size_t>(reinterpret_cast<uintptr_t>(ram_ptr)));
m_emit->mov(shadow_ptr_reg, static_cast<size_t>(reinterpret_cast<uintptr_t>(shadow_ptr)));
bool first = true;
u32 offset = 0;
while (size >= 16)
{
const Xbyak::Xmm& dst = first ? m_emit->xmm0 : m_emit->xmm1;
m_emit->movups(dst, m_emit->xword[ram_ptr_reg + offset]);
m_emit->pcmpeqd(dst, m_emit->xword[shadow_ptr_reg + offset]);
if (!first)
m_emit->pand(m_emit->xmm0, dst);
else
first = false;
offset += 16;
size -= 16;
}
// TODO: better codegen for 16 byte aligned blocks
if (!first)
{
m_emit->movmskps(temp_reg32, m_emit->xmm0);
m_emit->cmp(temp_reg32, 0xf);
m_emit->jne(CodeCache::g_discard_and_recompile_block);
}
while (size >= 8)
{
m_emit->mov(temp_reg, m_emit->qword[ram_ptr_reg + offset]);
m_emit->cmp(temp_reg, m_emit->qword[shadow_ptr_reg + offset]);
m_emit->jne(CodeCache::g_discard_and_recompile_block);
offset += 8;
size -= 8;
}
while (size >= 4)
{
m_emit->mov(temp_reg32, m_emit->dword[ram_ptr_reg + offset]);
m_emit->cmp(temp_reg32, m_emit->dword[shadow_ptr_reg + offset]);
m_emit->jne(CodeCache::g_discard_and_recompile_block);
offset += 4;
size -= 4;
}
DebugAssert(size == 0);
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
m_emit->mov(GetHostReg32(RRETURN), m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)]);
@ -2759,7 +3055,7 @@ void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
static_cast<s64>(reinterpret_cast<intptr_t>(address) - reinterpret_cast<intptr_t>(GetCurrentCodePointer()));
if (Xbyak::inner::IsInInt32(static_cast<u64>(jump_distance)))
{
m_emit->jmp(address);
m_emit->jmp(address, Xbyak::CodeGenerator::T_NEAR);
return;
}
@ -3068,77 +3364,4 @@ void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr)
else
m_emit->mov(GetHostReg64(host_reg), reinterpret_cast<size_t>(ptr));
}
CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
{
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state);
Xbyak::Label event_test;
m_emit->jmp(event_test);
// main dispatch loop
Xbyak::Label main_loop;
m_emit->align(16);
m_emit->L(main_loop);
// time to lookup the block
// eax <- pc
m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pc)]);
// rcx <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(Xbyak::Operand::RBX, CodeCache::GetFastMapPointer());
m_emit->mov(m_emit->ecx, m_emit->eax);
m_emit->shr(m_emit->ecx, 16);
m_emit->mov(m_emit->rcx, m_emit->qword[m_emit->rbx + m_emit->rcx * 8]);
// call(rcx[pc * 2]) (fast_map[pc >> 2])
m_emit->call(m_emit->qword[m_emit->rcx + m_emit->rax * 2]);
// eax <- pending_ticks
m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]);
// while eax < downcount
Xbyak::Label downcount_hit;
m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, downcount)]);
m_emit->jl(main_loop);
m_emit->L(event_test);
EmitCall(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
m_emit->jmp(main_loop);
// all done
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->ret();
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr);
return ptr;
}
CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher()
{
m_register_cache.ReserveCalleeSavedRegisters();
const u32 stack_adjust = PrepareStackForCall();
EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state);
m_emit->call(GetHostReg64(RARG1));
RestoreStackAfterCall(stack_adjust);
m_register_cache.PopCalleeSavedRegisters(true);
m_emit->ret();
CodeBlock::HostCodePointer ptr;
u32 code_size;
FinalizeBlock(&ptr, &code_size);
Log_DevPrintf("Single block dispatcher is %u bytes at %p", code_size, ptr);
return reinterpret_cast<CodeCache::SingleBlockDispatcherFunction>(ptr);
}
} // namespace CPU::Recompiler

View file

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
@ -6,6 +6,12 @@
#include "cpu_recompiler_types.h"
#include "cpu_types.h"
#if defined(CPU_ARCH_ARM32)
#include "vixl/aarch32/macro-assembler-aarch32.h"
#elif defined(CPU_ARCH_ARM64)
#include "vixl/aarch64/macro-assembler-aarch64.h"
#endif
#include <array>
#include <optional>
#include <stack>
@ -13,6 +19,59 @@
namespace CPU::Recompiler {
enum RegSize : u8
{
RegSize_8,
RegSize_16,
RegSize_32,
RegSize_64,
};
#if defined(CPU_ARCH_X64)
using HostReg = unsigned;
using CodeEmitter = Xbyak::CodeGenerator;
using LabelType = Xbyak::Label;
enum : u32
{
HostReg_Count = 16
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_64;
#elif defined(CPU_ARCH_ARM32)
using HostReg = unsigned;
using CodeEmitter = vixl::aarch32::MacroAssembler;
using LabelType = vixl::aarch32::Label;
enum : u32
{
HostReg_Count = vixl::aarch32::kNumberOfRegisters
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_32;
#elif defined(CPU_ARCH_ARM64)
using HostReg = unsigned;
using CodeEmitter = vixl::aarch64::MacroAssembler;
using LabelType = vixl::aarch64::Label;
enum : u32
{
HostReg_Count = vixl::aarch64::kNumberOfRegisters
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_64;
#else
#error Unknown architecture.
#endif
class CodeGenerator;
class RegisterCache;
enum class HostRegState : u8
{
None = 0,

View file

@ -1,15 +1,11 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#pragma once
#include "cpu_code_cache.h"
#include "cpu_types.h"
namespace CPU {
struct CodeBlock;
struct CodeBlockInstruction;
namespace Recompiler::Thunks {
namespace CPU::Recompiler::Thunks {
//////////////////////////////////////////////////////////////////////////
// Trampolines for calling back from the JIT
@ -18,7 +14,6 @@ namespace Recompiler::Thunks {
//////////////////////////////////////////////////////////////////////////
bool InterpretInstruction();
bool InterpretInstructionPGXP();
void CheckAndUpdateICache(u32 pc, u32 line_count);
// Memory access functions for the JIT - MSB is set on exception.
u64 ReadMemoryByte(u32 address);
@ -36,9 +31,6 @@ void UncheckedWriteMemoryByte(u32 address, u32 value);
void UncheckedWriteMemoryHalfWord(u32 address, u32 value);
void UncheckedWriteMemoryWord(u32 address, u32 value);
void ResolveBranch(CodeBlock* block, void* host_pc, void* host_resolve_pc, u32 host_pc_size);
void LogPC(u32 pc);
} // namespace Recompiler::Thunks
} // namespace CPU
} // namespace CPU::Recompiler::Thunks

View file

@ -1,6 +1,8 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
// Shared code between recompiler backends.
#pragma once
#include "cpu_types.h"
@ -14,158 +16,118 @@
#define XBYAK_NO_OP_NAMES 1
#include "xbyak.h"
#elif defined(CPU_ARCH_ARM32)
#include "vixl/aarch32/constants-aarch32.h"
#include "vixl/aarch32/instructions-aarch32.h"
#include "vixl/aarch32/macro-assembler-aarch32.h"
#elif defined(CPU_ARCH_ARM64)
#include "vixl/aarch64/constants-aarch64.h"
#include "vixl/aarch64/macro-assembler-aarch64.h"
#endif
namespace CPU {
namespace Recompiler {
class CodeGenerator;
class RegisterCache;
enum RegSize : u8
{
RegSize_8,
RegSize_16,
RegSize_32,
RegSize_64,
};
enum class Condition : u8
{
Always,
NotEqual,
Equal,
Overflow,
Greater,
GreaterEqual,
LessEqual,
Less,
Negative,
PositiveOrZero,
Above, // unsigned variant of Greater
AboveEqual, // unsigned variant of GreaterEqual
Below, // unsigned variant of Less
BelowEqual, // unsigned variant of LessEqual
NotZero,
Zero
};
#if defined(CPU_ARCH_X64)
using HostReg = unsigned;
using CodeEmitter = Xbyak::CodeGenerator;
using LabelType = Xbyak::Label;
enum : u32
{
HostReg_Count = 16
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_64;
namespace CPU::Recompiler {
// A reasonable "maximum" number of bytes per instruction.
constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64;
constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
// Alignment of code stoarge.
constexpr u32 CODE_STORAGE_ALIGNMENT = 4096;
// ABI selection
#if defined(_WIN32)
#define ABI_WIN64 1
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__HAIKU__) || defined(__FreeBSD__)
#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX)
#define RWARG1 Xbyak::Reg32(Xbyak::Operand::RCX)
#define RWARG2 Xbyak::Reg32(Xbyak::Operand::RDX)
#define RWARG3 Xbyak::Reg32(Xbyak::Operand::R8D)
#define RWARG4 Xbyak::Reg32(Xbyak::Operand::R9D)
#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX)
#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RCX)
#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RDX)
#define RXARG3 Xbyak::Reg64(Xbyak::Operand::R8)
#define RXARG4 Xbyak::Reg64(Xbyak::Operand::R9)
static constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32;
#elif defined(__linux__) || defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__)
#define ABI_SYSV 1
#define RWRET Xbyak::Reg32(Xbyak::Operand::EAX)
#define RWARG1 Xbyak::Reg32(Xbyak::Operand::EDI)
#define RWARG2 Xbyak::Reg32(Xbyak::Operand::ESI)
#define RWARG3 Xbyak::Reg32(Xbyak::Operand::EDX)
#define RWARG4 Xbyak::Reg32(Xbyak::Operand::ECX)
#define RXRET Xbyak::Reg64(Xbyak::Operand::RAX)
#define RXARG1 Xbyak::Reg64(Xbyak::Operand::RDI)
#define RXARG2 Xbyak::Reg64(Xbyak::Operand::RSI)
#define RXARG3 Xbyak::Reg64(Xbyak::Operand::RDX)
#define RXARG4 Xbyak::Reg64(Xbyak::Operand::RCX)
static constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 0;
#else
#error Unknown ABI.
#endif
bool IsCallerSavedRegister(u32 id);
} // namespace CPU::Recompiler
#elif defined(CPU_ARCH_ARM32)
using HostReg = unsigned;
using CodeEmitter = vixl::aarch32::MacroAssembler;
using LabelType = vixl::aarch32::Label;
enum : u32
{
HostReg_Count = vixl::aarch32::kNumberOfRegisters
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_32;
#include "vixl/aarch32/assembler-aarch32.h"
#include "vixl/aarch32/constants-aarch32.h"
#include "vixl/aarch32/instructions-aarch32.h"
namespace CPU::Recompiler {
// A reasonable "maximum" number of bytes per instruction.
constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64;
constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
// Alignment of code stoarge.
constexpr u32 CODE_STORAGE_ALIGNMENT = 4096;
#define RRET vixl::aarch32::r0
#define RARG1 vixl::aarch32::r0
#define RARG2 vixl::aarch32::r1
#define RARG3 vixl::aarch32::r2
#define RARG4 vixl::aarch32::r3
#define RSCRATCH vixl::aarch32::r12
#define RSTATE vixl::aarch32::r4
#define RMEMBASE vixl::aarch32::r5
s32 armGetPCDisplacement(const void* current, const void* target);
bool armIsPCDisplacementInImmediateRange(s32 displacement);
void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm);
void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
} // namespace CPU::Recompiler
#elif defined(CPU_ARCH_ARM64)
using HostReg = unsigned;
using CodeEmitter = vixl::aarch64::MacroAssembler;
using LabelType = vixl::aarch64::Label;
enum : u32
{
HostReg_Count = vixl::aarch64::kNumberOfRegisters
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_64;
#include "vixl/aarch64/assembler-aarch64.h"
#include "vixl/aarch64/constants-aarch64.h"
namespace CPU::Recompiler {
// A reasonable "maximum" number of bytes per instruction.
constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64;
constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
// Alignment of code stoarge.
constexpr u32 CODE_STORAGE_ALIGNMENT = 4096;
#define RWRET vixl::aarch64::w0
#define RXRET vixl::aarch64::x0
#define RWARG1 vixl::aarch64::w0
#define RXARG1 vixl::aarch64::x0
#define RWARG2 vixl::aarch64::w1
#define RXARG2 vixl::aarch64::x1
#define RWARG3 vixl::aarch64::w2
#define RXARG3 vixl::aarch64::x2
#define RWARG4 vixl::aarch64::w3
#define RXARG4 vixl::aarch64::x3
#define RWSCRATCH vixl::aarch64::w16
#define RXSCRATCH vixl::aarch64::x16
#define RSTATE vixl::aarch64::x19
#define RMEMBASE vixl::aarch64::x20
#elif defined(CPU_ARCH_RISCV64)
bool armIsCallerSavedRegister(u32 id);
s64 armGetPCDisplacement(const void* current, const void* target);
void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::XRegister& reg, const void* addr);
void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
u8* armGetJumpTrampoline(const void* target);
using HostReg = unsigned;
// Alignment of code stoarge.
constexpr u32 CODE_STORAGE_ALIGNMENT = 4096;
#else
using HostReg = int;
class CodeEmitter
{
};
enum : u32
{
HostReg_Count = 1
};
constexpr HostReg HostReg_Invalid = static_cast<HostReg>(HostReg_Count);
constexpr RegSize HostPointerSize = RegSize_64;
constexpr bool SHIFTS_ARE_IMPLICITLY_MASKED = false;
} // namespace CPU::Recompiler
#endif
struct LoadStoreBackpatchInfo
{
void* host_pc; // pointer to instruction which will fault
void* host_slowmem_pc; // pointer to slowmem callback code
u32 host_code_size; // size of the fastmem load as well as the add for cycles
HostReg address_host_reg; // register containing the guest address to load/store
HostReg value_host_reg; // register containing the source/destination
PhysicalMemoryAddress guest_pc;
u32 fault_count;
};
} // namespace Recompiler
} // namespace CPU

View file

@ -315,8 +315,7 @@ DEFINE_HOTKEY("TogglePGXP", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_NOO
PGXP::Shutdown();
// we need to recompile all blocks if pgxp is toggled on/off
if (g_settings.IsUsingCodeCache())
CPU::CodeCache::Flush();
CPU::CodeCache::Reset();
// need to swap interpreters
System::InterruptExecution();
@ -407,8 +406,7 @@ DEFINE_HOTKEY("TogglePGXPCPU", TRANSLATE_NOOP("Hotkeys", "Graphics"), TRANSLATE_
PGXP::Initialize();
// we need to recompile all blocks if pgxp is toggled on/off
if (g_settings.IsUsingCodeCache())
CPU::CodeCache::Flush();
CPU::CodeCache::Reset();
}
})

View file

@ -349,8 +349,9 @@ void ImGuiManager::DrawPerformanceOverlay()
System::GetMaximumFrameTime());
DRAW_LINE(fixed_font, text, IM_COL32(255, 255, 255, 255));
if (g_settings.cpu_overclock_active || (!g_settings.IsUsingRecompiler() || g_settings.cpu_recompiler_icache ||
g_settings.cpu_recompiler_memory_exceptions))
if (g_settings.cpu_overclock_active ||
(g_settings.cpu_execution_mode != CPUExecutionMode::Recompiler || g_settings.cpu_recompiler_icache ||
g_settings.cpu_recompiler_memory_exceptions))
{
first = true;
text.assign("CPU[");

View file

@ -254,8 +254,6 @@ struct Settings
bool log_to_window = false;
bool log_to_file = false;
ALWAYS_INLINE bool IsUsingCodeCache() const { return (cpu_execution_mode != CPUExecutionMode::Interpreter); }
ALWAYS_INLINE bool IsUsingRecompiler() const { return (cpu_execution_mode == CPUExecutionMode::Recompiler); }
ALWAYS_INLINE bool IsUsingSoftwareRenderer() const { return (gpu_renderer == GPURenderer::Software); }
ALWAYS_INLINE bool IsRunaheadEnabled() const { return (runahead_frames > 0); }
@ -275,12 +273,6 @@ struct Settings
gpu_pgxp_depth_clear_threshold = value / GPU_PGXP_DEPTH_THRESHOLD_SCALE;
}
ALWAYS_INLINE bool IsUsingFastmem() const
{
return (cpu_fastmem_mode != CPUFastmemMode::Disabled && cpu_execution_mode == CPUExecutionMode::Recompiler &&
!cpu_recompiler_memory_exceptions);
}
ALWAYS_INLINE s32 GetAudioOutputVolume(bool fast_forwarding) const
{
return audio_output_muted ? 0 : (fast_forwarding ? audio_fast_forward_volume : audio_output_volume);

View file

@ -244,6 +244,8 @@ void System::Internal::ProcessStartup()
if (!Bus::AllocateMemory())
Panic("Failed to allocate memory for emulated bus.");
CPU::CodeCache::ProcessStartup();
// This will call back to Host::LoadSettings() -> ReloadSources().
LoadSettings(false);
@ -265,6 +267,7 @@ void System::Internal::ProcessShutdown()
InputManager::CloseSources();
CPU::CodeCache::ProcessShutdown();
Bus::ReleaseMemory();
}
@ -1508,6 +1511,8 @@ bool System::Initialize(bool force_software_renderer)
return false;
}
CPU::CodeCache::Initialize();
if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer, false))
{
Bus::Shutdown();
@ -1536,9 +1541,6 @@ bool System::Initialize(bool force_software_renderer)
return false;
}
// CPU code cache must happen after GPU, because it might steal our address space.
CPU::CodeCache::Initialize();
DMA::Initialize();
InterruptController::Initialize();
@ -1704,6 +1706,7 @@ void System::Execute()
// TODO: Purge reset/restore
g_gpu->RestoreDeviceContext();
TimingEvents::UpdateCPUDowncount();
if (s_rewind_load_counter >= 0)
DoRewind();
@ -2037,9 +2040,9 @@ bool System::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
if (sw.IsReading())
{
if (is_memory_state)
CPU::CodeCache::InvalidateAll();
CPU::CodeCache::InvalidateAllRAMBlocks();
else
CPU::CodeCache::Flush();
CPU::CodeCache::Reset();
}
// only reset pgxp if we're not runahead-rollbacking. the value checks will save us from broken rendering, and it
@ -2158,7 +2161,7 @@ void System::InternalReset()
return;
CPU::Reset();
CPU::CodeCache::Flush();
CPU::CodeCache::Reset();
if (g_settings.gpu_pgxp_enable)
PGXP::Initialize();
@ -3522,7 +3525,10 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
g_settings.cpu_execution_mode))),
5.0f);
CPU::ExecutionModeChanged();
CPU::CodeCache::Reinitialize();
if (old_settings.cpu_execution_mode != CPUExecutionMode::Interpreter)
CPU::CodeCache::Shutdown();
if (g_settings.cpu_execution_mode != CPUExecutionMode::Interpreter)
CPU::CodeCache::Initialize();
CPU::ClearICache();
}
@ -3534,12 +3540,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
{
Host::AddOSDMessage(TRANSLATE_STR("OSDMessage", "Recompiler options changed, flushing all blocks."), 5.0f);
CPU::ExecutionModeChanged();
// changing memory exceptions can re-enable fastmem
if (g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions)
CPU::CodeCache::Reinitialize();
else
CPU::CodeCache::Flush();
CPU::CodeCache::Reset();
if (g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache)
CPU::ClearICache();
@ -3597,20 +3598,13 @@ void System::CheckForSettingsChanges(const Settings& old_settings)
g_settings.gpu_pgxp_vertex_cache != old_settings.gpu_pgxp_vertex_cache ||
g_settings.gpu_pgxp_cpu != old_settings.gpu_pgxp_cpu)))
{
if (g_settings.IsUsingCodeCache())
{
Host::AddOSDMessage(g_settings.gpu_pgxp_enable ?
TRANSLATE_STR("OSDMessage", "PGXP enabled, recompiling all blocks.") :
TRANSLATE_STR("OSDMessage", "PGXP disabled, recompiling all blocks."),
5.0f);
CPU::CodeCache::Flush();
}
if (old_settings.gpu_pgxp_enable)
PGXP::Shutdown();
if (g_settings.gpu_pgxp_enable)
PGXP::Initialize();
CPU::CodeCache::Reset();
}
if (g_settings.cdrom_readahead_sectors != old_settings.cdrom_readahead_sectors)

View file

@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
#include "page_fault_handler.h"
@ -28,24 +28,12 @@ struct RegisteredHandler
{
Callback callback;
const void* owner;
void* start_pc;
u32 code_size;
};
static std::vector<RegisteredHandler> m_handlers;
static std::mutex m_handler_lock;
static thread_local bool s_in_handler;
#if defined(CPU_ARCH_ARM32)
static bool IsStoreInstruction(const void* ptr)
{
u32 bits;
std::memcpy(&bits, ptr, sizeof(bits));
// TODO
return false;
}
#elif defined(CPU_ARCH_ARM64)
#if defined(CPU_ARCH_ARM64)
static bool IsStoreInstruction(const void* ptr)
{
u32 bits;
@ -146,7 +134,7 @@ static void SIGSEGVHandler(int sig, siginfo_t* info, void* ctx)
const bool is_write = (static_cast<ucontext_t*>(ctx)->uc_mcontext.gregs[REG_ERR] & 2) != 0;
#elif defined(CPU_ARCH_ARM32)
void* const exception_pc = reinterpret_cast<void*>(static_cast<ucontext_t*>(ctx)->uc_mcontext.arm_pc);
const bool is_write = IsStoreInstruction(exception_pc);
const bool is_write = (static_cast<ucontext_t*>(ctx)->uc_mcontext.error_code & (1 << 11)) != 0; // DFSR.WnR
#elif defined(CPU_ARCH_ARM64)
void* const exception_pc = reinterpret_cast<void*>(static_cast<ucontext_t*>(ctx)->uc_mcontext.pc);
const bool is_write = IsStoreInstruction(exception_pc);
@ -221,7 +209,7 @@ static void SIGSEGVHandler(int sig, siginfo_t* info, void* ctx)
#endif
bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback callback)
bool InstallHandler(const void* owner, Callback callback)
{
bool was_empty;
{
@ -267,7 +255,7 @@ bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback c
#endif
}
m_handlers.push_back(RegisteredHandler{callback, owner, start_pc, code_size});
m_handlers.push_back(RegisteredHandler{callback, owner});
return true;
}

View file

@ -14,7 +14,7 @@ enum class HandlerResult
using Callback = HandlerResult (*)(void* exception_pc, void* fault_address, bool is_write);
using Handle = void*;
bool InstallHandler(const void* owner, void* start_pc, u32 code_size, Callback callback);
bool InstallHandler(const void* owner, Callback callback);
bool RemoveHandler(const void* owner);
} // namespace Common::PageFaultHandler