CPU/Recompiler: Simplify fast map addressing

This commit is contained in:
Connor McLaughlin 2021-07-20 12:33:37 +10:00
parent 7f88cd5f9f
commit 033d85cd90
5 changed files with 187 additions and 79 deletions

View file

@ -46,29 +46,141 @@ alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8
#endif
static JitCodeBuffer s_code_buffer;
// First-level fast map: one entry per (pc >> FAST_MAP_TABLE_SHIFT) slot. Each entry is a table
// pointer in the biased form produced by EncodeFastMapPointer(), not a directly usable address.
static FastMapTable s_fast_map[FAST_MAP_TABLE_COUNT];
// Owns the storage for every second-level table; allocated in AllocateFastMap() and released in
// FreeFastMap().
static std::unique_ptr<CodeBlock::HostCodePointer[]> s_fast_map_pointers;
// NOTE(review): this flat std::array declaration reuses the name s_fast_map declared above;
// presumably it is the previous single-level map - confirm only one declaration is kept.
std::array<CodeBlock::HostCodePointer, FAST_MAP_TOTAL_SLOT_COUNT> s_fast_map;
DispatcherFunction s_asm_dispatcher;
// Invoked with the host code pointer of the block to run (see ExecuteRecompiler).
SingleBlockDispatcherFunction s_single_block_asm_dispatcher;
// NOTE(review): the GetFastMapIndex signature and the RAM/BIOS index expression below read 'pc',
// which is not a parameter of DecodeFastMapPointer; they look like the previous flat-index
// lookup interleaved with the new function - confirm which lines are meant to remain.
ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc)
// Recovers the real table address from an encoded s_fast_map entry by adding the slot bias back:
// (slot << 17) bytes when pointers are 8 bytes wide, (slot << 16) bytes otherwise.
// Inverse of EncodeFastMapPointer().
static FastMapTable DecodeFastMapPointer(u32 slot, FastMapTable ptr)
{
return ((pc & PHYSICAL_MEMORY_ADDRESS_MASK) >= Bus::BIOS_BASE) ?
(FAST_MAP_RAM_SLOT_COUNT + ((pc & Bus::BIOS_MASK) >> 2)) :
((pc & Bus::g_ram_mask) >> 2);
if constexpr (sizeof(void*) == 8)
// Widen before shifting so the byte offset is computed in 64 bits.
return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) + (static_cast<u64>(slot) << 17));
else
return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) + (slot << 16));
}
// Biases a table pointer downwards by its slot so that the stored value can later be offset by
// the full pc (see OffsetFastMapPointer) without masking off the slot bits first.
// The bias is (slot << 17) bytes with 8-byte pointers and (slot << 16) bytes otherwise.
// Inverse of DecodeFastMapPointer().
static FastMapTable EncodeFastMapPointer(u32 slot, FastMapTable ptr)
{
  u8* const table_bytes = reinterpret_cast<u8*>(ptr);

  if constexpr (sizeof(void*) == 8)
  {
    // Widen first: the shifted slot does not fit in 32 bits for high slots.
    const u64 bias = static_cast<u64>(slot) << 17;
    return reinterpret_cast<FastMapTable>(table_bytes - bias);
  }
  else
  {
    const u32 bias = slot << 16;
    return reinterpret_cast<FastMapTable>(table_bytes - bias);
  }
}
// Resolves the entry for 'pc' from an encoded (slot-biased) table pointer.
// The byte offset applied is pc * 2 with 8-byte pointers and pc itself otherwise.
static CodeBlock::HostCodePointer* OffsetFastMapPointer(FastMapTable fake_ptr, u32 pc)
{
  if constexpr (sizeof(void*) == 8)
  {
    // Widen before shifting so the top bit of pc is not lost.
    const u64 byte_offset = static_cast<u64>(pc) << 1;
    return reinterpret_cast<CodeBlock::HostCodePointer*>(reinterpret_cast<u8*>(fake_ptr) + byte_offset);
  }
  else
  {
    return reinterpret_cast<CodeBlock::HostCodePointer*>(reinterpret_cast<u8*>(fake_ptr) + pc);
  }
}
static void CompileDispatcher();
static void FastCompileBlockFunction();
static void InvalidCodeFunction();
// Number of fast-map tables needed to cover [start, end]; the table containing 'end' itself is
// counted, so a range that lies within a single table yields 1.
static constexpr u32 GetTableCount(u32 start, u32 end)
{
  const u32 first_slot = start >> FAST_MAP_TABLE_SHIFT;
  const u32 last_slot = end >> FAST_MAP_TABLE_SHIFT;
  return (last_slot - first_slot) + 1;
}
// Assigns one table to each slot covering [start, end], taking them consecutively from
// 'table_ptr'. On return 'table_ptr' has been advanced past the tables that were handed out.
static void AllocateFastMapTables(u32 start, u32 end, FastMapTable& table_ptr)
{
  const u32 first_slot = start >> FAST_MAP_TABLE_SHIFT;
  const u32 end_slot = first_slot + GetTableCount(start, end);

  for (u32 slot = first_slot; slot != end_slot; slot++)
  {
    // Entries are stored biased by their slot; see EncodeFastMapPointer().
    s_fast_map[slot] = EncodeFastMapPointer(slot, table_ptr);
    table_ptr += FAST_MAP_TABLE_SIZE;
  }
}
static void AllocateFastMap()
{
static constexpr VirtualMemoryAddress ranges[][2] = {
{0x00000000, 0x00800000}, // RAM
{0x1F000000, 0x1F800000}, // EXP1
{0x1FC00000, 0x1FC80000}, // BIOS
{0x80000000, 0x80800000}, // RAM
{0x9F000000, 0x9F800000}, // EXP1
{0x9FC00000, 0x9FC80000}, // BIOS
{0xA0000000, 0xA0800000}, // RAM
{0xBF000000, 0xBF800000}, // EXP1
{0xBFC00000, 0xBFC80000} // BIOS
};
u32 num_tables = 1; // unreachable table
for (u32 i = 0; i < countof(ranges); i++)
num_tables += GetTableCount(ranges[i][0], ranges[i][1]);
const u32 num_slots = FAST_MAP_TABLE_SIZE * num_tables;
if (!s_fast_map_pointers)
s_fast_map_pointers = std::make_unique<CodeBlock::HostCodePointer[]>(num_slots);
FastMapTable table_ptr = s_fast_map_pointers.get();
FastMapTable table_ptr_end = table_ptr + num_slots;
// Fill the first table with invalid/unreachable.
for (u32 i = 0; i < FAST_MAP_TABLE_SIZE; i++)
table_ptr[i] = InvalidCodeFunction;
// And the remaining with block compile pointers.
for (u32 i = FAST_MAP_TABLE_SIZE; i < num_slots; i++)
table_ptr[i] = FastCompileBlockFunction;
// Mark everything as unreachable to begin with.
for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++)
s_fast_map[i] = EncodeFastMapPointer(i, table_ptr);
table_ptr += FAST_MAP_TABLE_SIZE;
// Allocate ranges.
for (u32 i = 0; i < countof(ranges); i++)
AllocateFastMapTables(ranges[i][0], ranges[i][1], table_ptr);
Assert(table_ptr == table_ptr_end);
}
// Points every entry of every allocated (non-shared) table back at FastCompileBlockFunction.
// The tables are left alone when the table storage has not been allocated.
static void ResetFastMap()
{
// NOTE(review): fill() treats s_fast_map as a flat container of code pointers, while the loop
// below treats it as an array of encoded table pointers; this line looks like it belongs to the
// previous single-level map - confirm it is meant to remain.
s_fast_map.fill(FastCompileBlockFunction);
if (!s_fast_map_pointers)
return;
for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++)
{
FastMapTable ptr = DecodeFastMapPointer(i, s_fast_map[i]);
// Slots that resolve to the first table in the storage (the one AllocateFastMap fills with
// InvalidCodeFunction) are skipped so that table keeps its contents.
if (ptr == s_fast_map_pointers.get())
continue;
for (u32 j = 0; j < FAST_MAP_TABLE_SIZE; j++)
ptr[j] = FastCompileBlockFunction;
}
}
// Clears every first-level entry and releases the storage backing the tables.
static void FreeFastMap()
{
  // Null the entries first; they refer into the storage released below.
  for (auto& entry : s_fast_map)
    entry = nullptr;

  s_fast_map_pointers.reset();
}
// Stores 'function' as the fast-map entry for 'pc'. Does not touch the tables when the table
// storage has not been allocated.
static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function)
{
// NOTE(review): this flat-index store uses GetFastMapIndex() and assigns a code pointer directly
// into s_fast_map, unlike the table lookup below; it looks like the previous single-level
// implementation - confirm it is meant to remain.
s_fast_map[GetFastMapIndex(pc)] = function;
if (!s_fast_map_pointers)
return;
const u32 slot = pc >> FAST_MAP_TABLE_SHIFT;
FastMapTable encoded_ptr = s_fast_map[slot];
// Decoded only for the sanity check; the store below works from the encoded pointer.
const FastMapTable table_ptr = DecodeFastMapPointer(slot, encoded_ptr);
// The slot must have its own table: writing through the first (shared) table would change the
// entry seen by every slot that resolves to it.
Assert(table_ptr != nullptr && table_ptr != s_fast_map_pointers.get());
CodeBlock::HostCodePointer* ptr = OffsetFastMapPointer(encoded_ptr, pc);
*ptr = function;
}
#endif
@ -138,11 +250,13 @@ void Initialize()
Panic("Failed to initialize code space");
}
AllocateFastMap();
if (g_settings.IsUsingFastmem() && !InitializeFastmem())
Panic("Failed to initialize fastmem");
ResetFastMap();
CompileDispatcher();
ResetFastMap();
}
#endif
}
@ -169,6 +283,7 @@ void Shutdown()
ClearState();
#ifdef WITH_RECOMPILER
ShutdownFastmem();
FreeFastMap();
s_code_buffer.Destroy();
#endif
}
@ -305,9 +420,9 @@ void CompileDispatcher()
s_code_buffer.WriteProtect(true);
}
// NOTE(review): two signatures and two return statements are present here. The
// HostCodePointer* / .data() pair matches a flat std::array map, the FastMapTable* / plain-array
// pair matches the table-based map - confirm which pair is meant to remain.
CodeBlock::HostCodePointer* GetFastMapPointer()
// Returns the base of the first-level table array (s_fast_map).
FastMapTable* GetFastMapPointer()
{
return s_fast_map.data();
return s_fast_map;
}
void ExecuteRecompiler()
@ -334,8 +449,7 @@ void ExecuteRecompiler()
const u32 pc = g_state.regs.pc;
g_state.current_instruction_pc = pc;
const u32 fast_map_index = GetFastMapIndex(pc);
s_single_block_asm_dispatcher(s_fast_map[fast_map_index]);
s_single_block_asm_dispatcher(s_fast_map[pc >> 16][pc >> 2]);
}
TimingEvents::RunEvents();
@ -503,7 +617,7 @@ recompile:
if (block->recompile_count >= RECOMPILE_COUNT_TO_FALL_BACK_TO_INTERPRETER)
{
Log_PerfPrintf("Block 0x%08X has been recompiled %u times in %u frames, falling back to interpreter",
block->GetPC(), block->recompile_count, frame_diff);
block->GetPC(), block->recompile_count, frame_diff);
FallbackExistingBlockToInterpreter(block);
return false;
@ -683,11 +797,36 @@ void FastCompileBlockFunction()
{
CodeBlock* block = LookupBlock(GetNextBlockKey());
if (block)
{
s_single_block_asm_dispatcher(block->host_code);
}
else if (g_settings.gpu_pgxp_enable)
InterpretUncachedBlock<PGXPMode::Memory>();
{
if (g_settings.gpu_pgxp_cpu)
InterpretUncachedBlock<PGXPMode::CPU>();
else
InterpretUncachedBlock<PGXPMode::Memory>();
}
else
{
InterpretUncachedBlock<PGXPMode::Disabled>();
}
}
// Fast-map target for addresses that have no table of their own: logs the offending pc and then
// runs the block through the interpreter, using the PGXP mode selected by the current settings.
void InvalidCodeFunction()
{
  Log_ErrorPrintf("Trying to execute invalid code at 0x%08X", g_state.regs.pc);

  if (!g_settings.gpu_pgxp_enable)
  {
    InterpretUncachedBlock<PGXPMode::Disabled>();
    return;
  }

  if (g_settings.gpu_pgxp_cpu)
    InterpretUncachedBlock<PGXPMode::CPU>();
  else
    InterpretUncachedBlock<PGXPMode::Memory>();
}
#endif

View file

@ -16,13 +16,6 @@
namespace CPU {
// Slot counts for a flat fast map laid out as all RAM slots followed by all BIOS slots.
enum : u32
{
// One slot per 4 bytes of the 8MB RAM size.
FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_8MB_SIZE / 4,
// One slot per 4 bytes of the BIOS size.
FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4,
FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT,
};
union CodeBlockKey
{
u32 bits;
@ -107,6 +100,15 @@ struct CodeBlock
namespace CodeCache {
// Geometry of the two-level fast map.
enum : u32
{
  // pc >> FAST_MAP_TABLE_SHIFT selects the first-level slot, so one table spans 0x10000 bytes
  // of addresses.
  FAST_MAP_TABLE_SHIFT = 16,

  // Number of first-level slots.
  FAST_MAP_TABLE_COUNT = 0x10000,

  // Entries per table: one for every 4 bytes of the table's span (16384).
  FAST_MAP_TABLE_SIZE = (1u << FAST_MAP_TABLE_SHIFT) / 4,
};
using FastMapTable = CodeBlock::HostCodePointer*;
void Initialize();
void Shutdown();
void Execute();
@ -115,7 +117,7 @@ void Execute();
using DispatcherFunction = void (*)();
using SingleBlockDispatcherFunction = void(*)(const CodeBlock::HostCodePointer);
CodeBlock::HostCodePointer* GetFastMapPointer();
FastMapTable* GetFastMapPointer();
void ExecuteRecompiler();
#endif

View file

@ -2028,29 +2028,18 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
// time to lookup the block
// r0 <- pc
m_emit->Mov(a32::r3, Bus::BIOS_BASE);
m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, regs.pc)));
// current_instruction_pc <- pc (eax)
// r1 <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(2, CodeCache::GetFastMapPointer());
m_emit->lsr(a32::r1, a32::r0, 16);
m_emit->ldr(a32::r1, a32::MemOperand(a32::r2, a32::r1, a32::LSL, 2));
// current_instruction_pc <- pc (r0)
m_emit->str(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, current_instruction_pc)));
// r1 <- (pc & RAM_MASK) >> 2
m_emit->and_(a32::r1, a32::r0, Bus::g_ram_mask);
m_emit->lsr(a32::r1, a32::r1, 2);
// r2 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
m_emit->and_(a32::r2, a32::r0, Bus::BIOS_MASK);
m_emit->lsr(a32::r2, a32::r2, 2);
m_emit->add(a32::r2, a32::r2, FAST_MAP_RAM_SLOT_COUNT);
// if ((r0 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use r2 as index }
m_emit->and_(a32::r0, a32::r0, PHYSICAL_MEMORY_ADDRESS_MASK);
m_emit->cmp(a32::r0, a32::r3);
m_emit->mov(a32::ge, a32::r1, a32::r2);
// ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
EmitLoadGlobalAddress(0, CodeCache::GetFastMapPointer());
m_emit->ldr(a32::r0, a32::MemOperand(a32::r0, a32::r1, a32::LSL, 2));
// blr(r1[pc]) (fast_map[pc >> 2])
m_emit->ldr(a32::r0, a32::MemOperand(a32::r1, a32::r0));
m_emit->blx(a32::r0);
// end while

View file

@ -2239,29 +2239,18 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
// time to lookup the block
// w8 <- pc
m_emit->Mov(a64::w11, Bus::BIOS_BASE);
m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, regs.pc)));
// current_instruction_pc <- pc (eax)
// x9 <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(10, CodeCache::GetFastMapPointer());
m_emit->lsr(a64::w9, a64::w8, 16);
m_emit->ldr(a64::x9, a64::MemOperand(a64::x10, a64::x9, a64::LSL, 3));
// current_instruction_pc <- pc (w8)
m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, current_instruction_pc)));
// w9 <- (pc & RAM_MASK) >> 2
m_emit->and_(a64::w9, a64::w8, Bus::g_ram_mask);
m_emit->lsr(a64::w9, a64::w9, 2);
// w10 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
m_emit->and_(a64::w10, a64::w8, Bus::BIOS_MASK);
m_emit->lsr(a64::w10, a64::w10, 2);
m_emit->add(a64::w10, a64::w10, FAST_MAP_RAM_SLOT_COUNT);
// if ((w8 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use w10 as index }
m_emit->and_(a64::w8, a64::w8, PHYSICAL_MEMORY_ADDRESS_MASK);
m_emit->cmp(a64::w8, a64::w11);
m_emit->csel(a64::w8, a64::w9, a64::w10, a64::lt);
// ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
EmitLoadGlobalAddress(9, CodeCache::GetFastMapPointer());
m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3));
// blr(x9[pc * 2]) (fast_map[pc >> 2])
m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 2));
m_emit->blr(a64::x8);
// end while

View file

@ -2996,29 +2996,18 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
// eax <- pc
m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, regs.pc)]);
// ebx <- (pc & RAM_MASK) >> 2
m_emit->mov(m_emit->ebx, m_emit->eax);
m_emit->and_(m_emit->ebx, Bus::g_ram_mask);
m_emit->shr(m_emit->ebx, 2);
// ecx <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
m_emit->mov(m_emit->ecx, m_emit->eax);
m_emit->and_(m_emit->ecx, Bus::BIOS_MASK);
m_emit->shr(m_emit->ecx, 2);
m_emit->add(m_emit->ecx, FAST_MAP_RAM_SLOT_COUNT);
// current_instruction_pc <- pc (eax)
m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, current_instruction_pc)], m_emit->eax);
// if ((eax (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use ecx as index }
m_emit->and_(m_emit->eax, PHYSICAL_MEMORY_ADDRESS_MASK);
m_emit->cmp(m_emit->eax, Bus::BIOS_BASE);
m_emit->cmovge(m_emit->ebx, m_emit->ecx);
// rcx <- s_fast_map[pc >> 16]
EmitLoadGlobalAddress(Xbyak::Operand::RBX, CodeCache::GetFastMapPointer());
m_emit->mov(m_emit->ecx, m_emit->eax);
m_emit->shr(m_emit->ecx, 16);
m_emit->mov(m_emit->rcx, m_emit->qword[m_emit->rbx + m_emit->rcx * 8]);
// call(rcx[pc * 2]) (fast_map[pc >> 2])
m_emit->call(m_emit->qword[m_emit->rcx + m_emit->rax * 2]);
// ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
EmitLoadGlobalAddress(Xbyak::Operand::RAX, CodeCache::GetFastMapPointer());
m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax + m_emit->rbx * 8]);
m_emit->call(m_emit->rax);
m_emit->jmp(main_loop);
// end while