From 2e96931c32d26620f2ca85b4735d6b26ad7226ca Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 19 Jul 2024 19:31:33 +1000 Subject: [PATCH] CPU/CodeCache: Dynamically compute BIOS memory access timing The timings can change if the game does so. Instead of forcing the blocks to recompile, we can just manually multiply size * word_time. Improves stability of Nightmare Creatures booting, and fixes corrupted text in Formula Circus when using the cached interpreter. --- src/core/bus.cpp | 9 +++ src/core/bus.h | 3 + src/core/cpu_code_cache.cpp | 42 ++++++++--- src/core/cpu_code_cache_private.h | 2 + src/core/cpu_core.cpp | 34 ++++----- src/core/cpu_core_private.h | 2 +- src/core/cpu_newrec_compiler.cpp | 11 ++- src/core/cpu_newrec_compiler.h | 2 + src/core/cpu_newrec_compiler_aarch32.cpp | 23 ++++-- src/core/cpu_newrec_compiler_aarch64.cpp | 23 ++++-- src/core/cpu_newrec_compiler_riscv64.cpp | 42 +++++++++-- src/core/cpu_newrec_compiler_x64.cpp | 13 +++- src/core/cpu_recompiler_code_generator.cpp | 11 ++- src/core/cpu_recompiler_code_generator.h | 2 + .../cpu_recompiler_code_generator_aarch32.cpp | 34 ++++++++- .../cpu_recompiler_code_generator_aarch64.cpp | 75 ++++++++++++++++++- .../cpu_recompiler_code_generator_x64.cpp | 17 ++++- src/core/cpu_recompiler_types.h | 14 +++- 18 files changed, 294 insertions(+), 65 deletions(-) diff --git a/src/core/bus.cpp b/src/core/bus.cpp index 6d931ab27..0d75dd9a5 100644 --- a/src/core/bus.cpp +++ b/src/core/bus.cpp @@ -708,6 +708,15 @@ bool Bus::HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size) return false; } +const TickCount* Bus::GetMemoryAccessTimePtr(PhysicalMemoryAddress address, MemoryAccessSize size) +{ + // Currently only BIOS, but could be EXP1 as well. 
+ if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_MIRROR_SIZE)) + return &g_bios_access_time[static_cast(size)]; + + return nullptr; +} + std::optional Bus::GetMemoryRegionForAddress(PhysicalMemoryAddress address) { if (address < RAM_2MB_SIZE) diff --git a/src/core/bus.h b/src/core/bus.h index 30023e315..930dc9283 100644 --- a/src/core/bus.h +++ b/src/core/bus.h @@ -191,6 +191,9 @@ ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count) return static_cast(word_count + ((word_count + 15) / 16)); } +/// Returns a pointer to the cycle count for a non-RAM memory access. +const TickCount* GetMemoryAccessTimePtr(PhysicalMemoryAddress address, MemoryAccessSize size); + enum class MemoryRegion { RAM, diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 0a01c527d..1b4a90b62 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -823,8 +823,20 @@ template } DebugAssert(!(HasPendingInterrupt())); - if (g_settings.cpu_recompiler_icache) - CheckAndUpdateICacheTags(block->icache_line_count, block->uncached_fetch_ticks); + if (block->HasFlag(BlockFlags::IsUsingICache)) + { + CheckAndUpdateICacheTags(block->icache_line_count); + } + else if (block->HasFlag(BlockFlags::NeedsDynamicFetchTicks)) + { + AddPendingTicks( + static_cast(block->size * static_cast(*Bus::GetMemoryAccessTimePtr( + block->pc & PHYSICAL_MEMORY_ADDRESS_MASK, MemoryAccessSize::Word)))); + } + else + { + AddPendingTicks(block->uncached_fetch_ticks); + } InterpretCachedBlock(block); @@ -893,6 +905,9 @@ bool CPU::CodeCache::ReadBlockInstructions(u32 start_pc, BlockInstructionList* i // TODO: Jump to other block if it exists at this pc? 
const PageProtectionMode protection = GetProtectionModeForPC(start_pc); + const bool use_icache = CPU::IsCachedAddress(start_pc); + const bool dynamic_fetch_ticks = (!use_icache && Bus::GetMemoryAccessTimePtr(start_pc & PHYSICAL_MEMORY_ADDRESS_MASK, + MemoryAccessSize::Word) != nullptr); u32 pc = start_pc; bool is_branch_delay_slot = false; bool is_load_delay_slot = false; @@ -905,7 +920,8 @@ bool CPU::CodeCache::ReadBlockInstructions(u32 start_pc, BlockInstructionList* i instructions->clear(); metadata->icache_line_count = 0; metadata->uncached_fetch_ticks = 0; - metadata->flags = BlockFlags::None; + metadata->flags = use_icache ? BlockFlags::IsUsingICache : + (dynamic_fetch_ticks ? BlockFlags::NeedsDynamicFetchTicks : BlockFlags::None); u32 last_cache_line = ICACHE_LINES; u32 last_page = (protection == PageProtectionMode::WriteProtected) ? Bus::GetRAMCodePageIndex(start_pc) : 0; @@ -956,17 +972,23 @@ bool CPU::CodeCache::ReadBlockInstructions(u32 start_pc, BlockInstructionList* i info.is_store_instruction = IsMemoryStoreInstruction(instruction); info.has_load_delay = InstructionHasLoadDelay(instruction); - if (g_settings.cpu_recompiler_icache) + if (use_icache) { - const u32 icache_line = GetICacheLine(pc); - if (icache_line != last_cache_line) + if (g_settings.cpu_recompiler_icache) { - metadata->icache_line_count++; - last_cache_line = icache_line; + const u32 icache_line = GetICacheLine(pc); + if (icache_line != last_cache_line) + { + metadata->icache_line_count++; + last_cache_line = icache_line; + } } } + else if (!dynamic_fetch_ticks) + { + metadata->uncached_fetch_ticks += GetInstructionReadTicks(pc); + } - metadata->uncached_fetch_ticks += GetInstructionReadTicks(pc); if (info.is_load_instruction || info.is_store_instruction) metadata->flags |= BlockFlags::ContainsLoadStoreInstructions; @@ -1022,6 +1044,8 @@ bool CPU::CodeCache::ReadBlockInstructions(u32 start_pc, BlockInstructionList* i #ifdef _DEBUG SmallString disasm; DEBUG_LOG("Block at 0x{:08X}", 
start_pc); + DEBUG_LOG(" Uncached fetch ticks: {}", metadata->uncached_fetch_ticks); + DEBUG_LOG(" ICache line count: {}", metadata->icache_line_count); for (const auto& cbi : *instructions) { CPU::DisassembleInstruction(&disasm, cbi.second.pc, cbi.first.bits); diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h index 190ffe62e..024d21cd8 100644 --- a/src/core/cpu_code_cache_private.h +++ b/src/core/cpu_code_cache_private.h @@ -94,6 +94,8 @@ enum class BlockFlags : u8 ContainsLoadStoreInstructions = (1 << 0), SpansPages = (1 << 1), BranchDelaySpansPages = (1 << 2), + IsUsingICache = (1 << 3), + NeedsDynamicFetchTicks = (1 << 4), }; IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(BlockFlags); diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 4e301a19d..3b2ada372 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -2620,7 +2620,7 @@ TickCount CPU::GetInstructionReadTicks(VirtualMemoryAddress address) { return RAM_READ_TICKS; } - else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_SIZE)) + else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_MIRROR_SIZE)) { return g_bios_access_time[static_cast(MemoryAccessSize::Word)]; } @@ -2640,7 +2640,7 @@ TickCount CPU::GetICacheFillTicks(VirtualMemoryAddress address) { return 1 * ((ICACHE_LINE_SIZE - (address & (ICACHE_LINE_SIZE - 1))) / sizeof(u32)); } - else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_SIZE)) + else if (address >= BIOS_BASE && address < (BIOS_BASE + BIOS_MIRROR_SIZE)) { return g_bios_access_time[static_cast(MemoryAccessSize::Word)] * ((ICACHE_LINE_SIZE - (address & (ICACHE_LINE_SIZE - 1))) / sizeof(u32)); @@ -2651,29 +2651,23 @@ TickCount CPU::GetICacheFillTicks(VirtualMemoryAddress address) } } -void CPU::CheckAndUpdateICacheTags(u32 line_count, TickCount uncached_ticks) +void CPU::CheckAndUpdateICacheTags(u32 line_count) { VirtualMemoryAddress current_pc = g_state.pc & ICACHE_TAG_ADDRESS_MASK; - if (IsCachedAddress(current_pc)) - 
{ - TickCount ticks = 0; - TickCount cached_ticks_per_line = GetICacheFillTicks(current_pc); - for (u32 i = 0; i < line_count; i++, current_pc += ICACHE_LINE_SIZE) - { - const u32 line = GetICacheLine(current_pc); - if (g_state.icache_tags[line] != current_pc) - { - g_state.icache_tags[line] = current_pc; - ticks += cached_ticks_per_line; - } - } - g_state.pending_ticks += ticks; - } - else + TickCount ticks = 0; + TickCount cached_ticks_per_line = GetICacheFillTicks(current_pc); + for (u32 i = 0; i < line_count; i++, current_pc += ICACHE_LINE_SIZE) { - g_state.pending_ticks += uncached_ticks; + const u32 line = GetICacheLine(current_pc); + if (g_state.icache_tags[line] != current_pc) + { + g_state.icache_tags[line] = current_pc; + ticks += cached_ticks_per_line; + } } + + g_state.pending_ticks += ticks; } u32 CPU::FillICache(VirtualMemoryAddress address) diff --git a/src/core/cpu_core_private.h b/src/core/cpu_core_private.h index 5b91a193b..e2d89f45e 100644 --- a/src/core/cpu_core_private.h +++ b/src/core/cpu_core_private.h @@ -65,7 +65,7 @@ ALWAYS_INLINE static bool CompareICacheTag(VirtualMemoryAddress address) TickCount GetInstructionReadTicks(VirtualMemoryAddress address); TickCount GetICacheFillTicks(VirtualMemoryAddress address); u32 FillICache(VirtualMemoryAddress address); -void CheckAndUpdateICacheTags(u32 line_count, TickCount uncached_ticks); +void CheckAndUpdateICacheTags(u32 line_count); ALWAYS_INLINE static Segment GetSegmentForAddress(VirtualMemoryAddress address) { diff --git a/src/core/cpu_newrec_compiler.cpp b/src/core/cpu_newrec_compiler.cpp index 2c215e102..35cb6f25c 100644 --- a/src/core/cpu_newrec_compiler.cpp +++ b/src/core/cpu_newrec_compiler.cpp @@ -77,8 +77,7 @@ void CPU::NewRec::Compiler::BeginBlock() GenerateBlockProtectCheck(ram_ptr, shadow_ptr, m_block->size * sizeof(Instruction)); } - if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0) - GenerateICacheCheckAndUpdate(); + GenerateICacheCheckAndUpdate(); if 
(g_settings.bios_tty_logging) { @@ -1719,6 +1718,14 @@ void CPU::NewRec::Compiler::TruncateBlock() iinfo->is_last_instruction = true; } +const TickCount* CPU::NewRec::Compiler::GetFetchMemoryAccessTimePtr() const +{ + const TickCount* ptr = + Bus::GetMemoryAccessTimePtr(m_block->pc & PHYSICAL_MEMORY_ADDRESS_MASK, MemoryAccessSize::Word); + AssertMsg(ptr, "Address has dynamic fetch ticks"); + return ptr; +} + void CPU::NewRec::Compiler::FlushForLoadStore(const std::optional& address, bool store, bool use_fastmem) { diff --git a/src/core/cpu_newrec_compiler.h b/src/core/cpu_newrec_compiler.h index d40a1a6c4..d1e184040 100644 --- a/src/core/cpu_newrec_compiler.h +++ b/src/core/cpu_newrec_compiler.h @@ -201,6 +201,8 @@ protected: void SetCompilerPC(u32 newpc); void TruncateBlock(); + const TickCount* GetFetchMemoryAccessTimePtr() const; + virtual const void* GetCurrentCodePointer() = 0; virtual void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, diff --git a/src/core/cpu_newrec_compiler_aarch32.cpp b/src/core/cpu_newrec_compiler_aarch32.cpp index 45f2e1ad8..0d0347049 100644 --- a/src/core/cpu_newrec_compiler_aarch32.cpp +++ b/src/core/cpu_newrec_compiler_aarch32.cpp @@ -28,6 +28,7 @@ using namespace vixl::aarch32; using CPU::Recompiler::armEmitCall; using CPU::Recompiler::armEmitCondBranch; +using CPU::Recompiler::armEmitFarLoad; using CPU::Recompiler::armEmitJmp; using CPU::Recompiler::armEmitMov; using CPU::Recompiler::armGetJumpTrampoline; @@ -302,13 +303,25 @@ bool foo(const void* a, const void* b) void CPU::NewRec::AArch32Compiler::GenerateICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - armAsm->ldr(RARG1, PTR(&g_state.pending_ticks)); - armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); - armAsm->str(RARG1, PTR(&g_state.pending_ticks)); + if 
(m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + armEmitFarLoad(armAsm, RARG2, GetFetchMemoryAccessTimePtr()); + armAsm->ldr(RARG1, PTR(&g_state.pending_ticks)); + armEmitMov(armAsm, RARG3, m_block->size); + armAsm->mul(RARG2, RARG2, RARG3); + armAsm->add(RARG1, RARG1, RARG2); + armAsm->str(RARG1, PTR(&g_state.pending_ticks)); + } + else + { + armAsm->ldr(RARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); + armAsm->str(RARG1, PTR(&g_state.pending_ticks)); + } } - else + else if (m_block->icache_line_count > 0) { const auto& ticks_reg = RARG1; const auto& current_tag_reg = RARG2; diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp index c3fb3bd99..45d4c8674 100644 --- a/src/core/cpu_newrec_compiler_aarch64.cpp +++ b/src/core/cpu_newrec_compiler_aarch64.cpp @@ -27,6 +27,7 @@ using namespace vixl::aarch64; using CPU::Recompiler::armEmitCall; using CPU::Recompiler::armEmitCondBranch; +using CPU::Recompiler::armEmitFarLoad; using CPU::Recompiler::armEmitJmp; using CPU::Recompiler::armEmitMov; using CPU::Recompiler::armGetJumpTrampoline; @@ -274,13 +275,25 @@ void CPU::NewRec::AArch64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, void CPU::NewRec::AArch64Compiler::GenerateICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); - armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); - armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr()); + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armEmitMov(armAsm, RWARG3, m_block->size); + armAsm->mul(RWARG2, RWARG2, RWARG3); + armAsm->add(RWARG1, 
RWARG1, RWARG2); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + } + else + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + } } - else + else if (m_block->icache_line_count > 0) { const auto& ticks_reg = RWARG1; const auto& current_tag_reg = RWARG2; diff --git a/src/core/cpu_newrec_compiler_riscv64.cpp b/src/core/cpu_newrec_compiler_riscv64.cpp index 6ae4cf24c..61a6557ac 100644 --- a/src/core/cpu_newrec_compiler_riscv64.cpp +++ b/src/core/cpu_newrec_compiler_riscv64.cpp @@ -40,6 +40,7 @@ using namespace biscuit; using CPU::Recompiler::rvEmitCall; using CPU::Recompiler::rvEmitDSExtW; using CPU::Recompiler::rvEmitDUExtW; +using CPU::Recompiler::rvEmitFarLoad; using CPU::Recompiler::rvEmitJmp; using CPU::Recompiler::rvEmitMov; using CPU::Recompiler::rvEmitMov64; @@ -130,6 +131,25 @@ u32 CPU::Recompiler::rvEmitCall(biscuit::Assembler* rvAsm, const void* ptr) return rvEmitJmp(rvAsm, ptr, biscuit::ra); } +void CPU::Recompiler::rvEmitFarLoad(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr, + bool sign_extend_word) +{ + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr); + rvAsm->AUIPC(reg, hi); + if (sign_extend_word) + rvAsm->LW(reg, lo, reg); + else + rvAsm->LWU(reg, lo, reg); +} + +void CPU::Recompiler::rvEmitFarStore(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr, + const biscuit::GPR& tempreg) +{ + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr); + rvAsm->AUIPC(tempreg, hi); + rvAsm->SW(reg, lo, tempreg); +} + void CPU::Recompiler::rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) { rvAsm->SLLI(rd, rs, 24); @@ -525,13 +545,25 @@ void CPU::NewRec::RISCV64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, void 
CPU::NewRec::RISCV64Compiler::GenerateICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); - SafeADDIW(RARG1, RARG1, static_cast(m_block->uncached_fetch_ticks)); - rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + rvEmitFarLoad(rvAsm, RARG2, GetFetchMemoryAccessTimePtr()); + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + rvEmitMov(rvAsm, RARG3, m_block->size); + rvAsm->MULW(RARG2, RARG2, RARG3); + rvAsm->ADD(RARG1, RARG1, RARG2); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } + else + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + SafeADDIW(RARG1, RARG1, static_cast(m_block->uncached_fetch_ticks)); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } } - else + else if (m_block->icache_line_count > 0) { const auto& ticks_reg = RARG1; const auto& current_tag_reg = RARG2; diff --git a/src/core/cpu_newrec_compiler_x64.cpp b/src/core/cpu_newrec_compiler_x64.cpp index 81385c690..88ccf5bca 100644 --- a/src/core/cpu_newrec_compiler_x64.cpp +++ b/src/core/cpu_newrec_compiler_x64.cpp @@ -179,9 +179,18 @@ void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, cons void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(m_block->uncached_fetch_ticks)); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + cg->mov(cg->eax, m_block->size); + cg->mul(cg->dword[cg->rip + GetFetchMemoryAccessTimePtr()]); + cg->add(cg->dword[PTR(&g_state.pending_ticks)], cg->eax); + } + else + { + cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(m_block->uncached_fetch_ticks)); + } } else if (m_block->icache_line_count > 0) { 
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index 9c0f78abd..f9cc1b6f7 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -926,6 +926,14 @@ Value CodeGenerator::NotValue(const Value& val) return res; } +const TickCount* CodeGenerator::GetFetchMemoryAccessTimePtr() const +{ + const TickCount* ptr = + Bus::GetMemoryAccessTimePtr(m_block->pc & PHYSICAL_MEMORY_ADDRESS_MASK, MemoryAccessSize::Word); + AssertMsg(ptr, "Address has dynamic fetch ticks"); + return ptr; +} + void CodeGenerator::GenerateExceptionExit(Instruction instruction, const CodeCache::InstructionInfo& info, Exception excode, Condition condition /* = Condition::Always */) { @@ -996,8 +1004,7 @@ void CodeGenerator::BlockPrologue() EmitFunctionCall(nullptr, &CPU::HandleB0Syscall); } - if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0) - EmitICacheCheckAndUpdate(); + EmitICacheCheckAndUpdate(); // we don't know the state of the last block, so assume load delays might be in progress // TODO: Pull load delay into register cache diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 35f15bb82..d0744dba6 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -198,6 +198,8 @@ public: Value XorValues(const Value& lhs, const Value& rhs); Value NotValue(const Value& val); + const TickCount* GetFetchMemoryAccessTimePtr() const; + // Raising exception if condition is true. 
void GenerateExceptionExit(Instruction instruction, const CodeCache::InstructionInfo& info, Exception excode, Condition condition = Condition::Always); diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index afb6f92ac..f95140713 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -140,6 +140,20 @@ void CPU::Recompiler::armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl:: } } +void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, + const void* addr) +{ + armMoveAddressToReg(armAsm, reg, addr); + armAsm->ldr(reg, vixl::aarch32::MemOperand(reg)); +} + +void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, + const void* addr, const vixl::aarch32::Register& tempreg) +{ + armMoveAddressToReg(armAsm, tempreg, addr); + armAsm->str(reg, vixl::aarch32::MemOperand(tempreg)); +} + void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) { #ifdef ENABLE_HOST_DISASSEMBLY @@ -1913,12 +1927,24 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) void CodeGenerator::EmitICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - EmitAddCPUStructField(OFFSETOF(State, pending_ticks), - Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + armEmitFarLoad(m_emit, RARG2, GetFetchMemoryAccessTimePtr()); + m_emit->ldr(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + m_emit->Mov(RARG3, m_block->size); + m_emit->mul(RARG2, RARG2, RARG3); + m_emit->add(RARG1, RARG1, RARG2); + m_emit->str(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + } + else + { +
EmitAddCPUStructField(OFFSETOF(State, pending_ticks), + Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); + } } - else + else if (m_block->icache_line_count > 0) { const auto& ticks_reg = a32::r0; const auto& current_tag_reg = a32::r1; diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 6fff92998..57b67d446 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -261,6 +261,61 @@ void CPU::Recompiler::armEmitCondBranch(a64::Assembler* armAsm, a64::Condition c } } +void CPU::Recompiler::armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, + const void* addr, bool sign_extend_word) +{ + const void* cur = armAsm->GetCursorAddress(); + const void* current_code_ptr_page = + reinterpret_cast(reinterpret_cast(cur) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(addr) & ~static_cast(0xFFF)); + const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(addr) & 0xFFFu); + a64::MemOperand memop; + + const vixl::aarch64::Register xreg = reg.X(); + if (vixl::IsInt21(page_displacement)) + { + armAsm->adrp(xreg, page_displacement); + memop = vixl::aarch64::MemOperand(xreg, static_cast(page_offset)); + } + else + { + armMoveAddressToReg(armAsm, xreg, addr); + memop = vixl::aarch64::MemOperand(xreg); + } + + if (sign_extend_word) + armAsm->ldrsw(reg, memop); + else + armAsm->ldr(reg, memop); +} + +void CPU::Recompiler::armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, + const void* addr, const vixl::aarch64::Register& tempreg) +{ + DebugAssert(tempreg.IsX()); + + const void* cur = armAsm->GetCursorAddress(); + const void* current_code_ptr_page = + reinterpret_cast(reinterpret_cast(cur) & ~static_cast(0xFFF)); + const void* ptr_page = + 
reinterpret_cast(reinterpret_cast(addr) & ~static_cast(0xFFF)); + const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(addr) & 0xFFFu); + + if (vixl::IsInt21(page_displacement)) + { + armAsm->adrp(tempreg, page_displacement); + armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast(page_offset))); + } + else + { + armMoveAddressToReg(armAsm, tempreg, addr); + armAsm->str(reg, vixl::aarch64::MemOperand(tempreg)); + } +} + u8* CPU::Recompiler::armGetJumpTrampoline(const void* target) { auto it = s_trampoline_targets.find(target); @@ -2240,12 +2295,24 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) void CodeGenerator::EmitICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - EmitAddCPUStructField(OFFSETOF(State, pending_ticks), - Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + armEmitFarLoad(m_emit, RWARG2, GetFetchMemoryAccessTimePtr()); + m_emit->Ldr(RWARG1, a64::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + m_emit->Mov(RWARG3, m_block->size); + m_emit->Mul(RWARG2, RWARG2, RWARG3); + m_emit->Add(RWARG1, RWARG1, RWARG2); + m_emit->Str(RWARG1, a64::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + } + else + { + EmitAddCPUStructField(OFFSETOF(State, pending_ticks), + Value::FromConstantU32(static_cast(m_block->uncached_fetch_ticks))); + } } - else + else if (m_block->icache_line_count > 0) { const auto& ticks_reg = a64::w0; const auto& current_tag_reg = a64::w1; diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index d8868044e..88eceac46 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -2782,12 +2782,21 @@ 
void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) void CodeGenerator::EmitICacheCheckAndUpdate() { - if (GetSegmentForAddress(m_pc) >= Segment::KSEG1) + if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache)) { - m_emit->add(m_emit->dword[GetCPUPtrReg() + OFFSETOF(State, pending_ticks)], - static_cast(m_block->uncached_fetch_ticks)); + if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) + { + m_emit->mov(m_emit->eax, m_block->size); + m_emit->mul(m_emit->dword[m_emit->rip + GetFetchMemoryAccessTimePtr()]); + m_emit->add(m_emit->dword[GetCPUPtrReg() + OFFSETOF(State, pending_ticks)], m_emit->eax); + } + else + { + m_emit->add(m_emit->dword[GetCPUPtrReg() + OFFSETOF(State, pending_ticks)], + static_cast(m_block->uncached_fetch_ticks)); + } } - else + else if (m_block->icache_line_count > 0) { VirtualMemoryAddress current_pc = m_pc & ICACHE_TAG_ADDRESS_MASK; for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index de71b9f2b..4a0e3e330 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -93,6 +93,9 @@ void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline); void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline); void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr); +void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr); +void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr, + const vixl::aarch32::Register& tempreg = RSCRATCH); u8* armGetJumpTrampoline(const void* target); } // namespace CPU::Recompiler @@ -129,6 +132,10 @@ void armEmitMov(vixl::aarch64::Assembler* armAsm, const 
vixl::aarch64::Register& void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline); void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline); void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr); +void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr, + bool sign_extend_word = false); +void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr, + const vixl::aarch64::Register& tempreg = RXSCRATCH); u8* armGetJumpTrampoline(const void* target); } // namespace CPU::Recompiler @@ -157,8 +164,11 @@ std::pair rvGetAddressImmediates(const void* cur, const void* target); void rvMoveAddressToReg(biscuit::Assembler* armAsm, const biscuit::GPR& reg, const void* addr); void rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm); void rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch, u64 imm); -u32 rvEmitJmp(biscuit::Assembler* armAsm, const void* ptr, const biscuit::GPR& link_reg = biscuit::zero); -u32 rvEmitCall(biscuit::Assembler* armAsm, const void* ptr); +u32 rvEmitJmp(biscuit::Assembler* rvAsm, const void* ptr, const biscuit::GPR& link_reg = biscuit::zero); +u32 rvEmitCall(biscuit::Assembler* rvAsm, const void* ptr); +void rvEmitFarLoad(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr, bool sign_extend_word = false); +void rvEmitFarStore(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr, + const biscuit::GPR& tempreg = RSCRATCH); void rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word void rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word void rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word