From a6f8dde790bb3550ef00be310feb0137da1139a0 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 18 Oct 2020 14:43:09 +1000 Subject: [PATCH] CPU/Recompiler: Faster ASM dispatcher --- src/core/bus.h | 13 - src/core/cpu_code_cache.cpp | 35 ++- src/core/cpu_code_cache.h | 24 ++ src/core/cpu_core.cpp | 1 + src/core/cpu_recompiler_code_generator.h | 7 + .../cpu_recompiler_code_generator_aarch64.cpp | 265 +++++++++++++----- .../cpu_recompiler_code_generator_x64.cpp | 216 +++++++++++--- src/core/cpu_recompiler_register_cache.cpp | 71 ++++- src/core/cpu_recompiler_register_cache.h | 6 + src/core/dma.cpp | 5 +- src/core/timing_event.cpp | 5 + src/core/timing_event.h | 2 + 12 files changed, 513 insertions(+), 137 deletions(-) diff --git a/src/core/bus.h b/src/core/bus.h index 10c44f90e..d2f187ba6 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -1,6 +1,5 @@ #pragma once #include "common/bitfield.h" -#include "cpu_code_cache.h" #include "types.h" #include #include @@ -97,16 +96,4 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } -/// Invalidates any code pages which overlap the specified range. 
-ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) -{ - const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; - const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; - for (u32 page = start_page; page <= end_page; page++) - { - if (m_ram_code_bits[page]) - CPU::CodeCache::InvalidateBlocksWithPageIndex(page); - } -} - } // namespace Bus diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index f6bf291f2..7e8db76f5 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -35,14 +35,9 @@ alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8 static JitCodeBuffer s_code_buffer; -enum : u32 -{ - FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, - FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, - FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, -}; - std::array s_fast_map; +DispatcherFunction s_asm_dispatcher; +SingleBlockDispatcherFunction s_single_block_asm_dispatcher; ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) { @@ -51,6 +46,7 @@ ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc) ((pc & Bus::RAM_MASK) >> 2); } +static void CompileDispatcher(); static void FastCompileBlockFunction(); static void ResetFastMap() @@ -111,6 +107,7 @@ void Initialize(bool use_recompiler) } ResetFastMap(); + CompileDispatcher(); #else s_use_recompiler = false; #endif @@ -238,9 +235,27 @@ void Execute() #ifdef WITH_RECOMPILER +void CompileDispatcher() +{ + { + Recompiler::CodeGenerator cg(&s_code_buffer); + s_asm_dispatcher = cg.CompileDispatcher(); + } + { + Recompiler::CodeGenerator cg(&s_code_buffer); + s_single_block_asm_dispatcher = cg.CompileSingleBlockDispatcher(); + } +} + +CodeBlock::HostCodePointer* GetFastMapPointer() +{ + return s_fast_map.data(); +} + void ExecuteRecompiler() { g_state.frame_done = false; +#if 0 while (!g_state.frame_done) { if (HasPendingInterrupt()) @@ -261,6 +276,9 @@ void ExecuteRecompiler() TimingEvents::RunEvents(); } 
+#else + s_asm_dispatcher(); +#endif // in case we switch to interpreter... g_state.regs.npc = g_state.regs.pc; @@ -291,6 +309,7 @@ void Flush() #ifdef WITH_RECOMPILER s_code_buffer.Reset(); ResetFastMap(); + CompileDispatcher(); #endif } @@ -499,7 +518,7 @@ void FastCompileBlockFunction() { CodeBlock* block = LookupBlock(GetNextBlockKey()); if (block) - block->host_code(); + s_single_block_asm_dispatcher(block->host_code); else InterpretUncachedBlock(); } diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index 068e6706e..6d09a8c0d 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -1,4 +1,5 @@ #pragma once +#include "bus.h" #include "common/bitfield.h" #include "common/jit_code_buffer.h" #include "cpu_types.h" @@ -9,6 +10,13 @@ namespace CPU { +enum : u32 +{ + FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_SIZE / 4, + FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4, + FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT, +}; + union CodeBlockKey { u32 bits; @@ -86,6 +94,10 @@ void Shutdown(); void Execute(); #ifdef WITH_RECOMPILER +using DispatcherFunction = void (*)(); +using SingleBlockDispatcherFunction = void(*)(const CodeBlock::HostCodePointer); + +CodeBlock::HostCodePointer* GetFastMapPointer(); void ExecuteRecompiler(); #endif @@ -102,6 +114,18 @@ template void InterpretCachedBlock(const CodeBlock& block); void InterpretUncachedBlock(); +/// Invalidates any code pages which overlap the specified range. 
+ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count) +{ + const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE; + const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE; + for (u32 page = start_page; page <= end_page; page++) + { + if (Bus::m_ram_code_bits[page]) + CPU::CodeCache::InvalidateBlocksWithPageIndex(page); + } +} + }; // namespace CodeCache } // namespace CPU diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 7bf72a92d..b433ff3ff 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -1381,6 +1381,7 @@ void DispatchInterrupt() { // If the instruction we're about to execute is a GTE instruction, delay dispatching the interrupt until the next // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. + SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction()) GTE::ExecuteInstruction(g_state.next_instruction.bits); diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 438786bd3..be07f25e8 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -25,6 +25,9 @@ public: bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); + CodeCache::DispatcherFunction CompileDispatcher(); + CodeCache::SingleBlockDispatcherFunction CompileSingleBlockDispatcher(); + ////////////////////////////////////////////////////////////////////////// // Code Generation ////////////////////////////////////////////////////////////////////////// @@ -67,6 +70,7 @@ public: void EmitAddCPUStructField(u32 offset, const Value& value); void EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr); void EmitStoreGlobal(void* ptr, const Value& value); + void 
EmitLoadGlobalAddress(HostReg host_reg, const void* ptr); // Automatically generates an exception handler. Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size); @@ -86,6 +90,7 @@ public: u32 PrepareStackForCall(); void RestoreStackAfterCall(u32 adjust_size); + void EmitCall(const void* ptr); void EmitFunctionCallPtr(Value* return_value, const void* ptr); void EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1); void EmitFunctionCallPtr(Value* return_value, const void* ptr, const Value& arg1, const Value& arg2); @@ -128,7 +133,9 @@ public: // Host register saving. void EmitPushHostReg(HostReg reg, u32 position); + void EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position); void EmitPopHostReg(HostReg reg, u32 position); + void EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position); // Value ops Value AddValues(const Value& lhs, const Value& rhs, bool set_flags); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 7d772c099..e9fb87263 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -6,6 +6,7 @@ #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" Log_SetChannel(CPU::Recompiler); namespace a64 = vixl::aarch64; @@ -26,6 +27,16 @@ constexpr u64 FUNCTION_CALLER_SAVED_SPACE_RESERVE = 144; // 18 registers -> 224 constexpr u64 FUNCTION_STACK_SIZE = FUNCTION_CALLEE_SAVED_SPACE_RESERVE + FUNCTION_CALLER_SAVED_SPACE_RESERVE + FUNCTION_CALL_SHADOW_SPACE; +// PC we return to after the end of the block +static void* s_dispatcher_return_address; + +static s64 GetPCDisplacement(const void* current, const void* target) +{ + Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); + Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); + return 
static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); +} + static const a64::WRegister GetHostReg8(HostReg reg) { return a64::WRegister(reg); @@ -172,11 +183,11 @@ void CodeGenerator::EmitBeginBlock() // Save the link register, since we'll be calling functions. const bool link_reg_allocated = m_register_cache.AllocateHostReg(30); DebugAssert(link_reg_allocated); + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); // Store the CPU struct pointer. TODO: make this better. const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -185,6 +196,7 @@ void CodeGenerator::EmitEndBlock() m_register_cache.PopCalleeSavedRegisters(true); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -200,6 +212,7 @@ void CodeGenerator::EmitExceptionExit() m_register_cache.PopCalleeSavedRegisters(false); m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); m_emit->Ret(); } @@ -228,10 +241,10 @@ void CodeGenerator::FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32 m_far_emitter.FinalizeCode(); *out_host_code = reinterpret_cast(m_code_buffer->GetFreeCodePointer()); - *out_host_code_size = m_near_emitter.GetSizeOfCodeGenerated(); + *out_host_code_size = static_cast(m_near_emitter.GetSizeOfCodeGenerated()); - m_code_buffer->CommitCode(m_near_emitter.GetSizeOfCodeGenerated()); - m_code_buffer->CommitFarCode(m_far_emitter.GetSizeOfCodeGenerated()); + m_code_buffer->CommitCode(static_cast(m_near_emitter.GetSizeOfCodeGenerated())); + m_code_buffer->CommitFarCode(static_cast(m_far_emitter.GetSizeOfCodeGenerated())); m_near_emitter.Reset(); m_far_emitter.Reset(); @@ -958,11 +971,19 @@ void 
CodeGenerator::RestoreStackAfterCall(u32 adjust_size) m_register_cache.PopCallerSavedRegisters(); } -static s64 GetBranchDisplacement(const void* current, const void* target) +void CodeGenerator::EmitCall(const void* ptr) { - Assert(Common::IsAlignedPow2(reinterpret_cast(current), 4)); - Assert(Common::IsAlignedPow2(reinterpret_cast(target), 4)); - return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); + const s64 displacement = GetPCDisplacement(GetCurrentCodePointer(), ptr); + const bool use_blr = !vixl::IsInt26(displacement); + if (use_blr) + { + m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + m_emit->Blr(GetHostReg64(RSCRATCH)); + } + else + { + m_emit->bl(displacement); + } } void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) @@ -974,17 +995,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) const u32 adjust_size = PrepareStackForCall(); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); - const bool use_blr = !vixl::IsInt26(displacement); - if (use_blr) - { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); - m_emit->Blr(GetHostReg64(RSCRATCH)); - } - else - { - m_emit->bl(displacement); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1009,17 +1020,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG1, arg1); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); - const bool use_blr = !vixl::IsInt26(displacement); - if (use_blr) - { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); - m_emit->Blr(GetHostReg64(RSCRATCH)); - } - else - { - m_emit->bl(displacement); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1045,17 +1046,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* 
ptr, co EmitCopyValue(RARG2, arg2); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); - const bool use_blr = !vixl::IsInt26(displacement); - if (use_blr) - { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); - m_emit->Blr(GetHostReg64(RSCRATCH)); - } - else - { - m_emit->bl(displacement); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1083,17 +1074,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG3, arg3); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); - const bool use_blr = !vixl::IsInt26(displacement); - if (use_blr) - { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); - m_emit->Blr(GetHostReg64(RSCRATCH)); - } - else - { - m_emit->bl(displacement); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1122,17 +1103,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG4, arg4); // actually call the function - const s64 displacement = GetBranchDisplacement(GetCurrentCodePointer(), ptr); - const bool use_blr = !vixl::IsInt26(displacement); - if (use_blr) - { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); - m_emit->Blr(GetHostReg64(RSCRATCH)); - } - else - { - m_emit->bl(displacement); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1148,13 +1119,25 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position) { const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); - m_emit->Str(GetHostReg64(reg), addr); + m_emit->str(GetHostReg64(reg), addr); +} + +void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position) +{ + const a64::MemOperand 
addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - ((position + 1) * 8)); + m_emit->stp(GetHostReg64(reg2), GetHostReg64(reg), addr); } void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position) { const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); - m_emit->Ldr(GetHostReg64(reg), addr); + m_emit->ldr(GetHostReg64(reg), addr); +} + +void CodeGenerator::EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position) +{ + const a64::MemOperand addr(a64::sp, FUNCTION_STACK_SIZE - FUNCTION_CALL_SHADOW_SPACE - (position * 8)); + m_emit->ldp(GetHostReg64(reg2), GetHostReg64(reg), addr); } void CodeGenerator::EmitLoadCPUStructField(HostReg host_reg, RegSize guest_size, u32 offset) @@ -1510,7 +1493,7 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (size) { case RegSize_8: @@ -1535,7 +1518,7 @@ void CodeGenerator::EmitStoreGlobal(void* ptr, const Value& value) { Value value_in_hr = GetValueInHostRegister(value); - m_emit->Mov(GetHostReg64(RSCRATCH), reinterpret_cast(ptr)); + EmitLoadGlobalAddress(RSCRATCH, ptr); switch (value.size) { case RegSize_8: @@ -1882,4 +1865,152 @@ void CodeGenerator::EmitBindLabel(LabelType* label) m_emit->Bind(label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const void* current_code_ptr_page = reinterpret_cast( + reinterpret_cast(GetCurrentCodePointer()) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(ptr) & ~static_cast(0xFFF)); + const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(ptr) & 0xFFFu); + if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) + { 
+ m_emit->adrp(GetHostReg64(host_reg), page_displacement); + m_emit->orr(GetHostReg64(host_reg), GetHostReg64(host_reg), page_offset); + } + else + { + m_emit->Mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); + } +} + +CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher() +{ + m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_register_cache.ReserveCalleeSavedRegisters(); + const u32 stack_adjust = PrepareStackForCall(); + + EmitLoadGlobalAddress(RCPUPTR, &g_state); + + a64::Label frame_done_loop; + a64::Label exit_dispatcher; + m_emit->Bind(&frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->ldrb(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, frame_done))); + m_emit->tbnz(a64::w8, 0, &exit_dispatcher); + + // x8 <- sr + a64::Label no_interrupt; + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.sr.bits))); + + // if Iec == 0 then goto no_interrupt + m_emit->tbz(a64::w8, 0, &no_interrupt); + + // x9 <- cause + // x8 (sr) & cause + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, cop0_regs.cause.bits))); + m_emit->and_(a64::w8, a64::w8, a64::w9); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->tst(a64::w8, 0xFF00); + m_emit->b(&no_interrupt, a64::eq); + + // we have an interrupt + EmitCall(reinterpret_cast(&DispatchInterrupt)); + + // no interrupt or we just serviced it + m_emit->Bind(&no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // x8 <- head event->downcount + // downcount <- x8 + EmitLoadGlobalAddress(8, TimingEvents::GetHeadEventPtr()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x8)); + m_emit->ldr(a64::w8, a64::MemOperand(a64::x8, offsetof(TimingEvent, m_downcount))); + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // main dispatch loop + a64::Label main_loop; + m_emit->Bind(&main_loop); + s_dispatcher_return_address = GetCurrentCodePointer(); + + // w8 <- 
pending_ticks + // w9 <- downcount + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, pending_ticks))); + m_emit->ldr(a64::w9, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, downcount))); + + // while pending_ticks < downcount + a64::Label downcount_hit; + m_emit->cmp(a64::w8, a64::w9); + m_emit->b(&downcount_hit, a64::ge); + + // time to lookup the block + // w8 <- pc + m_emit->Mov(a64::w11, Bus::BIOS_BASE); + m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, regs.pc))); + + // current_instruction_pc <- pc (w8) + m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, current_instruction_pc))); + + // w9 <- (pc & RAM_MASK) >> 2 + m_emit->and_(a64::w9, a64::w8, Bus::RAM_MASK); + m_emit->lsr(a64::w9, a64::w9, 2); + + // w10 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->and_(a64::w10, a64::w8, Bus::BIOS_MASK); + m_emit->lsr(a64::w10, a64::w10, 2); + m_emit->add(a64::w10, a64::w10, FAST_MAP_RAM_SLOT_COUNT); + + // if ((w8 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use w10 as index } + m_emit->and_(a64::w8, a64::w8, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->cmp(a64::w8, a64::w11); + m_emit->csel(a64::w8, a64::w9, a64::w10, a64::lt); + + // w8 contains our index, x8 <- fast_map[x8 * 8], x8(), continue + EmitLoadGlobalAddress(9, CodeCache::GetFastMapPointer()); + m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3)); + m_emit->blr(a64::x8); + + // end while + m_emit->Bind(&downcount_hit); + + // check events then for frame done + EmitCall(reinterpret_cast(&TimingEvents::RunEvents)); + m_emit->b(&frame_done_loop); + + // all done + m_emit->Bind(&exit_dispatcher); + RestoreStackAfterCall(stack_adjust); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_emit->ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_DevPrintf("Dispatcher is %u bytes 
at %p", code_size, ptr); + return reinterpret_cast(ptr); +} + +CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher() +{ + m_emit->sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_register_cache.ReserveCalleeSavedRegisters(); + const u32 stack_adjust = PrepareStackForCall(); + + m_emit->blr(GetHostReg64(RARG1)); + + RestoreStackAfterCall(stack_adjust); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); + m_emit->ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + return reinterpret_cast(ptr); +} + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 142f86fad..04d9a8134 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1,9 +1,12 @@ #include "common/align.h" +#include "common/log.h" #include "cpu_core.h" #include "cpu_core_private.h" #include "cpu_recompiler_code_generator.h" #include "cpu_recompiler_thunks.h" #include "settings.h" +#include "timing_event.h" +Log_SetChannel(Recompiler::CodeGenerator); namespace CPU::Recompiler { @@ -187,10 +190,12 @@ Value CodeGenerator::GetValueInHostRegister(const Value& value, bool allow_zero_ void CodeGenerator::EmitBeginBlock() { + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + // Store the CPU struct pointer. 
const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); DebugAssert(cpu_reg_allocated); - m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); } void CodeGenerator::EmitEndBlock() @@ -1392,15 +1397,8 @@ void CodeGenerator::RestoreStackAfterCall(u32 adjust_size) m_register_cache.PopCallerSavedRegisters(); } -void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) +void CodeGenerator::EmitCall(const void* ptr) { - if (return_value) - return_value->Discard(); - - // shadow space allocate - const u32 adjust_size = PrepareStackForCall(); - - // actually call the function if (Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) { m_emit->call(ptr); @@ -1410,6 +1408,18 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); m_emit->call(GetHostReg64(RRETURN)); } +} + +void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr) +{ + if (return_value) + return_value->Discard(); + + // shadow space allocate + const u32 adjust_size = PrepareStackForCall(); + + // actually call the function + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1434,15 +1444,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG1, arg1); // actually call the function - if (Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) - { - m_emit->call(ptr); - } - else - { - m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); - m_emit->call(GetHostReg64(RRETURN)); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1468,15 +1470,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG2, arg2); // actually call the function - if 
(Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) - { - m_emit->call(ptr); - } - else - { - m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); - m_emit->call(GetHostReg64(RRETURN)); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1504,15 +1498,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG3, arg3); // actually call the function - if (Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) - { - m_emit->call(ptr); - } - else - { - m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); - m_emit->call(GetHostReg64(RRETURN)); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1541,15 +1527,7 @@ void CodeGenerator::EmitFunctionCallPtr(Value* return_value, const void* ptr, co EmitCopyValue(RARG4, arg4); // actually call the function - if (Xbyak::inner::IsInInt32(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr()))) - { - m_emit->call(ptr); - } - else - { - m_emit->mov(GetHostReg64(RRETURN), reinterpret_cast(ptr)); - m_emit->call(GetHostReg64(RRETURN)); - } + EmitCall(ptr); // shadow space release RestoreStackAfterCall(adjust_size); @@ -1567,11 +1545,23 @@ void CodeGenerator::EmitPushHostReg(HostReg reg, u32 position) m_emit->push(GetHostReg64(reg)); } +void CodeGenerator::EmitPushHostRegPair(HostReg reg, HostReg reg2, u32 position) +{ + m_emit->push(GetHostReg64(reg)); + m_emit->push(GetHostReg64(reg2)); +} + void CodeGenerator::EmitPopHostReg(HostReg reg, u32 position) { m_emit->pop(GetHostReg64(reg)); } +void CodeGenerator::EmitPopHostRegPair(HostReg reg, HostReg reg2, u32 position) +{ + m_emit->pop(GetHostReg64(reg2)); + m_emit->pop(GetHostReg64(reg)); +} + void CodeGenerator::EmitLoadCPUStructField(HostReg host_reg, RegSize guest_size, u32 offset) { switch (guest_size) @@ -2516,4 +2506,140 @@ void CodeGenerator::EmitBindLabel(LabelType* label) 
m_emit->L(*label); } +void CodeGenerator::EmitLoadGlobalAddress(HostReg host_reg, const void* ptr) +{ + const s64 displacement = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(m_emit->getCurr())) + 2; + if (Xbyak::inner::IsInInt32(static_cast(displacement))) + m_emit->lea(GetHostReg64(host_reg), m_emit->dword[m_emit->rip + ptr]); + else + m_emit->mov(GetHostReg64(host_reg), reinterpret_cast(ptr)); +} + +CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher() +{ + m_register_cache.ReserveCalleeSavedRegisters(); + const u32 stack_adjust = PrepareStackForCall(); + + EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state); + + Xbyak::Label frame_done_loop; + Xbyak::Label exit_dispatcher; + m_emit->L(frame_done_loop); + + // if frame_done goto exit_dispatcher + m_emit->test(m_emit->byte[m_emit->rbp + offsetof(State, frame_done)], 1); + m_emit->jnz(exit_dispatcher, Xbyak::CodeGenerator::T_NEAR); + + // eax <- sr + Xbyak::Label no_interrupt; + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.sr.bits)]); + + // if Iec == 0 then goto no_interrupt + m_emit->test(m_emit->eax, 1); + m_emit->jz(no_interrupt); + + // sr & cause + m_emit->and_(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, cop0_regs.cause.bits)]); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + m_emit->test(m_emit->eax, 0xFF00); + m_emit->jz(no_interrupt); + + // we have an interrupt + EmitCall(reinterpret_cast(&DispatchInterrupt)); + + // no interrupt or we just serviced it + m_emit->L(no_interrupt); + + // TimingEvents::UpdateCPUDowncount: + // eax <- head event->downcount + // downcount <- eax + EmitLoadGlobalAddress(Xbyak::Operand::RAX, TimingEvents::GetHeadEventPtr()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax]); + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rax + offsetof(TimingEvent, m_downcount)]); + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, downcount)], m_emit->eax); + + // main dispatch loop + Xbyak::Label 
main_loop; + m_emit->align(16); + m_emit->L(main_loop); + + // eax <- pending_ticks + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, pending_ticks)]); + + // while eax < downcount + Xbyak::Label downcount_hit; + m_emit->cmp(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, downcount)]); + m_emit->jge(downcount_hit); + + // time to lookup the block + // eax <- pc + m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, regs.pc)]); + + // ebx <- (pc & RAM_MASK) >> 2 + m_emit->mov(m_emit->ebx, m_emit->eax); + m_emit->and_(m_emit->ebx, Bus::RAM_MASK); + m_emit->shr(m_emit->ebx, 2); + + // ecx <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT + m_emit->mov(m_emit->ecx, m_emit->eax); + m_emit->and_(m_emit->ecx, Bus::BIOS_MASK); + m_emit->shr(m_emit->ecx, 2); + m_emit->add(m_emit->ecx, FAST_MAP_RAM_SLOT_COUNT); + + // current_instruction_pc <- pc (eax) + m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, current_instruction_pc)], m_emit->eax); + + // if ((eax (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use ecx as index } + m_emit->and_(m_emit->eax, PHYSICAL_MEMORY_ADDRESS_MASK); + m_emit->cmp(m_emit->eax, Bus::BIOS_BASE); + m_emit->cmovge(m_emit->ebx, m_emit->ecx); + + // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue + EmitLoadGlobalAddress(Xbyak::Operand::RAX, CodeCache::GetFastMapPointer()); + m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax + m_emit->rbx * 8]); + m_emit->call(m_emit->rax); + m_emit->jmp(main_loop); + + // end while + m_emit->L(downcount_hit); + + // check events then for frame done + EmitCall(reinterpret_cast(&TimingEvents::RunEvents)); + m_emit->jmp(frame_done_loop); + + // all done + m_emit->L(exit_dispatcher); + RestoreStackAfterCall(stack_adjust); + m_register_cache.PopCalleeSavedRegisters(true); + m_emit->ret(); + + CodeBlock::HostCodePointer ptr; + u32 code_size; + FinalizeBlock(&ptr, &code_size); + Log_DevPrintf("Dispatcher is %u bytes at %p", code_size, ptr); + 
return ptr;
+}
+
+CodeCache::SingleBlockDispatcherFunction CodeGenerator::CompileSingleBlockDispatcher()
+{
+  m_register_cache.ReserveCalleeSavedRegisters();
+  const u32 stack_adjust = PrepareStackForCall();
+
+  EmitLoadGlobalAddress(Xbyak::Operand::RBP, &g_state);
+
+  m_emit->call(GetHostReg64(RARG1));
+
+  RestoreStackAfterCall(stack_adjust);
+  m_register_cache.PopCalleeSavedRegisters(true);
+  m_emit->ret();
+
+  CodeBlock::HostCodePointer ptr;
+  u32 code_size;
+  FinalizeBlock(&ptr, &code_size);
+  Log_DevPrintf("Single block dispatcher is %u bytes at %p", code_size, ptr);
+  return reinterpret_cast<CodeCache::SingleBlockDispatcherFunction>(ptr);
+}
+
 } // namespace CPU::Recompiler
diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp
index dac853c81..00e0d3281 100644
--- a/src/core/cpu_recompiler_register_cache.cpp
+++ b/src/core/cpu_recompiler_register_cache.cpp
@@ -318,8 +318,25 @@ u32 RegisterCache::PopCallerSavedRegisters() const
     if ((m_state.host_reg_state[i] & (HostRegState::CallerSaved | HostRegState::InUse | HostRegState::Discarded)) ==
         (HostRegState::CallerSaved | HostRegState::InUse))
     {
-      m_code_generator.EmitPopHostReg(static_cast<HostReg>(i), position);
-      position--;
+      u32 reg_pair;
+      for (reg_pair = (i - 1); reg_pair > 0 && reg_pair < HostReg_Count; reg_pair--)
+      {
+        if ((m_state.host_reg_state[reg_pair] &
+             (HostRegState::CallerSaved | HostRegState::InUse | HostRegState::Discarded)) ==
+            (HostRegState::CallerSaved | HostRegState::InUse))
+        {
+          m_code_generator.EmitPopHostRegPair(static_cast<HostReg>(reg_pair), static_cast<HostReg>(i), position);
+          position -= 2;
+          i = reg_pair;
+          break;
+        }
+      }
+
+      if (reg_pair == 0)
+      {
+        m_code_generator.EmitPopHostReg(static_cast<HostReg>(i), position);
+        position--;
+      }
     }
     i--;
   } while (i > 0);
@@ -351,6 +368,61 @@ u32 RegisterCache::PopCalleeSavedRegisters(bool commit)
   return count;
 }
 
+// Pushes every callee-saved host register that is not yet marked CalleeSavedAllocated, recording each one in
+// callee_saved_order and flagging it as allocated. Registers are pushed as pairs where a partner exists.
+void RegisterCache::ReserveCalleeSavedRegisters()
+{
+  for (u32 reg = 0; reg < HostReg_Count; reg++)
+  {
+    if ((m_state.host_reg_state[reg] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) ==
+        HostRegState::CalleeSaved)
+    {
+      DebugAssert(m_state.callee_saved_order_count < HostReg_Count);
+
+      // can we find a paired register? (mainly for ARM)
+      // Bound the scan on reg_pair: reg is constant in here, so testing it would run past host_reg_state.
+      u32 reg_pair;
+      for (reg_pair = reg + 1; reg_pair < HostReg_Count; reg_pair++)
+      {
+        if ((m_state.host_reg_state[reg_pair] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) ==
+            HostRegState::CalleeSaved)
+        {
+          m_code_generator.EmitPushHostRegPair(static_cast<HostReg>(reg), static_cast<HostReg>(reg_pair),
+                                               GetActiveCalleeSavedRegisterCount());
+
+          m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast<HostReg>(reg);
+          m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated;
+          m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast<HostReg>(reg_pair);
+          m_state.host_reg_state[reg_pair] |= HostRegState::CalleeSavedAllocated;
+          reg = reg_pair;
+          break;
+        }
+      }
+
+      // no partner was found, so push this register on its own
+      if (reg_pair == HostReg_Count)
+      {
+        m_code_generator.EmitPushHostReg(static_cast<HostReg>(reg), GetActiveCalleeSavedRegisterCount());
+        m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast<HostReg>(reg);
+        m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated;
+      }
+    }
+  }
+}
+
+// Clears the CalleeSaved flag on every register that has it set without CalleeSavedAllocated.
+void RegisterCache::AssumeCalleeSavedRegistersAreSaved()
+{
+  for (u32 i = 0; i < HostReg_Count; i++)
+  {
+    if ((m_state.host_reg_state[i] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) ==
+        HostRegState::CalleeSaved)
+    {
+      m_state.host_reg_state[i] &= ~HostRegState::CalleeSaved;
+    }
+  }
+}
+
 void RegisterCache::PushState()
 {
   // need to copy this manually because of the load delay values
diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h
index 0c989f296..c5c3cb4da 100644
--- a/src/core/cpu_recompiler_register_cache.h
+++ b/src/core/cpu_recompiler_register_cache.h
@@ -248,6 +248,12 @@ public:
   /// Restore callee-saved registers. Call at the end of the function.
u32 PopCalleeSavedRegisters(bool commit); + /// Preallocates callee-saved registers, enabling later use without stack pushes. + void ReserveCalleeSavedRegisters(); + + /// Removes the callee-saved register flag from all registers which have not been allocated. Call when compiling code blocks. + void AssumeCalleeSavedRegistersAreSaved(); + /// Pushes the register allocator state, use when entering branched code. void PushState(); diff --git a/src/core/dma.cpp b/src/core/dma.cpp index d685f5158..e4168e88b 100644 --- a/src/core/dma.cpp +++ b/src/core/dma.cpp @@ -4,6 +4,7 @@ #include "common/log.h" #include "common/state_wrapper.h" #include "common/string_util.h" +#include "cpu_code_cache.h" #include "cpu_core.h" #include "gpu.h" #include "interrupt_controller.h" @@ -499,7 +500,7 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen const u32 terminator = UINT32_C(0xFFFFFF); std::memcpy(&ram_pointer[address], &terminator, sizeof(terminator)); - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } @@ -547,6 +548,6 @@ TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 incremen } } - Bus::InvalidateCodePages(address, word_count); + CPU::CodeCache::InvalidateCodePages(address, word_count); return Bus::GetDMARAMTickCount(word_count); } diff --git a/src/core/timing_event.cpp b/src/core/timing_event.cpp index 2c8de39ec..37451e926 100644 --- a/src/core/timing_event.cpp +++ b/src/core/timing_event.cpp @@ -57,6 +57,11 @@ void UpdateCPUDowncount() } } +TimingEvent** GetHeadEventPtr() +{ + return &s_active_events_head; +} + static void SortEvent(TimingEvent* event) { const TickCount event_downcount = event->m_downcount; diff --git a/src/core/timing_event.h b/src/core/timing_event.h index ca58ddbdf..0e012a1d7 100644 --- a/src/core/timing_event.h +++ b/src/core/timing_event.h @@ -88,6 +88,8 @@ void RunEvents(); void UpdateCPUDowncount(); +TimingEvent** 
GetHeadEventPtr(); + } // namespace TimingEventManager \ No newline at end of file