From 21938e14c6f8a413d5c98341d8289a45d3e2e558 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sat, 22 May 2021 14:55:25 +1000 Subject: [PATCH] CPU/Recompiler: Implement block linking --- src/core/cpu_code_cache.cpp | 140 ++++++++-- src/core/cpu_code_cache.h | 14 +- src/core/cpu_recompiler_code_generator.cpp | 250 ++++++++++++++---- src/core/cpu_recompiler_code_generator.h | 9 +- .../cpu_recompiler_code_generator_aarch32.cpp | 86 ++++-- .../cpu_recompiler_code_generator_aarch64.cpp | 94 +++++-- .../cpu_recompiler_code_generator_x64.cpp | 137 ++++++++-- src/core/cpu_recompiler_register_cache.cpp | 15 ++ src/core/cpu_recompiler_register_cache.h | 13 + src/core/cpu_recompiler_thunks.h | 4 + src/core/cpu_types.cpp | 11 +- src/core/cpu_types.h | 2 +- src/core/host_interface.cpp | 22 +- src/core/settings.cpp | 2 + src/core/settings.h | 1 + src/duckstation-qt/advancedsettingswidget.cpp | 25 +- src/frontend-common/fullscreen_ui.cpp | 6 + 17 files changed, 666 insertions(+), 165 deletions(-) diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 40d17d6b7..d8947f539 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -38,6 +38,7 @@ static constexpr u32 RECOMPILER_CODE_CACHE_SIZE = 32 * 1024 * 1024; static constexpr u32 RECOMPILER_FAR_CODE_CACHE_SIZE = 16 * 1024 * 1024; #endif static constexpr u32 CODE_WRITE_FAULT_THRESHOLD_FOR_SLOWMEM = 10; +static constexpr u32 INVALIDATE_THRESHOLD_TO_DISABLE_LINKING = 10; #ifdef USE_STATIC_CODE_BUFFER static constexpr u32 RECOMPILER_GUARD_SIZE = 4096; @@ -205,8 +206,8 @@ static void RemoveReferencesToBlock(CodeBlock* block); static void AddBlockToPageMap(CodeBlock* block); static void RemoveBlockFromPageMap(CodeBlock* block); -/// Link block from to to. -static void LinkBlock(CodeBlock* from, CodeBlock* to); +/// Link block from to to. Returns the successor index. 
+static void LinkBlock(CodeBlock* from, CodeBlock* to, void* host_pc, void* host_resolve_pc, u32 host_pc_size); /// Unlink all blocks which point to this block, and any that this block links to. static void UnlinkBlock(CodeBlock* block); @@ -352,8 +353,9 @@ static void ExecuteImpl() { // Try to find an already-linked block. // TODO: Don't need to dereference the block, just store a pointer to the code. - for (CodeBlock* linked_block : block->link_successors) + for (const CodeBlock::LinkInfo& li : block->link_successors) { + CodeBlock* linked_block = li.block; if (linked_block->key.bits == next_block_key.bits) { if (linked_block->invalidated && !RevalidateBlock(linked_block)) @@ -373,7 +375,7 @@ static void ExecuteImpl() if (next_block) { // Link the previous block to this new block if we find a new block. - LinkBlock(block, next_block); + LinkBlock(block, next_block, nullptr, nullptr, 0); block = next_block; goto reexecute_block; } @@ -686,6 +688,7 @@ bool CompileBlock(CodeBlock* block) cbi.is_store_instruction = IsMemoryStoreInstruction(cbi.instruction); cbi.has_load_delay = InstructionHasLoadDelay(cbi.instruction); cbi.can_trap = CanInstructionTrap(cbi.instruction, InUserMode()); + cbi.is_direct_branch_instruction = IsDirectBranchInstruction(cbi.instruction); if (g_settings.cpu_recompiler_icache) { @@ -718,7 +721,7 @@ bool CompileBlock(CodeBlock* block) } // change the pc for the second branch's delay slot, it comes from the first branch - pc = GetBranchInstructionTarget(prev_cbi.instruction, prev_cbi.pc); + pc = GetDirectBranchTarget(prev_cbi.instruction, prev_cbi.pc); Log_DevPrintf("Double branch at %08X, using delay slot from %08X -> %08X", cbi.pc, prev_cbi.pc, pc); } @@ -840,6 +843,25 @@ void InvalidateBlocksWithPageIndex(u32 page_index) // Invalidate forces the block to be checked again. 
Log_DebugPrintf("Invalidating block at 0x%08X", block->GetPC()); block->invalidated = true; + + if (block->can_link) + { + const u32 frame_number = System::GetFrameNumber(); + const u32 frame_diff = frame_number - block->invalidate_frame_number; + if (frame_diff <= INVALIDATE_THRESHOLD_TO_DISABLE_LINKING) + { + Log_PerfPrintf("Block 0x%08X has been invalidated in %u frames, disabling linking", block->GetPC(), frame_diff); + block->can_link = false; + } + else + { + // It's been a while since this block was modified, so it's all good. + block->invalidate_frame_number = frame_number; + } + } + + UnlinkBlock(block); + #ifdef WITH_RECOMPILER SetFastMap(block->GetPC(), FastCompileBlockFunction); #endif @@ -902,30 +924,80 @@ void RemoveBlockFromPageMap(CodeBlock* block) } } -void LinkBlock(CodeBlock* from, CodeBlock* to) +void LinkBlock(CodeBlock* from, CodeBlock* to, void* host_pc, void* host_resolve_pc, u32 host_pc_size) { Log_DebugPrintf("Linking block %p(%08x) to %p(%08x)", from, from->GetPC(), to, to->GetPC()); - from->link_successors.push_back(to); - to->link_predecessors.push_back(from); + + CodeBlock::LinkInfo li; + li.block = to; + li.host_pc = host_pc; + li.host_resolve_pc = host_resolve_pc; + li.host_pc_size = host_pc_size; + from->link_successors.push_back(li); + + li.block = from; + to->link_predecessors.push_back(li); + + // apply in code + if (host_pc) + { + Log_ProfilePrintf("Backpatching %p(%08x) to jump to block %p (%08x)", host_pc, from->GetPC(), to, to->GetPC()); + s_code_buffer.WriteProtect(false); + Recompiler::CodeGenerator::BackpatchBranch(host_pc, host_pc_size, reinterpret_cast(to->host_code)); + s_code_buffer.WriteProtect(true); + } } void UnlinkBlock(CodeBlock* block) { - for (CodeBlock* predecessor : block->link_predecessors) + if (block->link_predecessors.empty() && block->link_successors.empty()) + return; + +#ifdef WITH_RECOMPILER + if (g_settings.IsUsingRecompiler() && g_settings.cpu_recompiler_block_linking) + 
s_code_buffer.WriteProtect(false); +#endif + + for (CodeBlock::LinkInfo& li : block->link_predecessors) { - auto iter = std::find(predecessor->link_successors.begin(), predecessor->link_successors.end(), block); - Assert(iter != predecessor->link_successors.end()); - predecessor->link_successors.erase(iter); + auto iter = std::find_if(li.block->link_successors.begin(), li.block->link_successors.end(), + [block](const CodeBlock::LinkInfo& li) { return li.block == block; }); + Assert(iter != li.block->link_successors.end()); + + // Restore blocks linked to this block back to the resolver + if (li.host_pc) + { + Log_ProfilePrintf("Backpatching %p(%08x) [predecessor] to jump to resolver", li.host_pc, li.block->GetPC()); + Recompiler::CodeGenerator::BackpatchBranch(li.host_pc, li.host_pc_size, li.host_resolve_pc); + } + + li.block->link_successors.erase(iter); } block->link_predecessors.clear(); - for (CodeBlock* successor : block->link_successors) + for (CodeBlock::LinkInfo& li : block->link_successors) { - auto iter = std::find(successor->link_predecessors.begin(), successor->link_predecessors.end(), block); - Assert(iter != successor->link_predecessors.end()); - successor->link_predecessors.erase(iter); + auto iter = std::find_if(li.block->link_predecessors.begin(), li.block->link_predecessors.end(), + [block](const CodeBlock::LinkInfo& li) { return li.block == block; }); + Assert(iter != li.block->link_predecessors.end()); + + // Restore blocks we're linking to back to the resolver, since the successor won't be linked to us to backpatch if + // it changes. + if (li.host_pc) + { + Log_ProfilePrintf("Backpatching %p(%08x) [successor] to jump to resolver", li.host_pc, li.block->GetPC()); + Recompiler::CodeGenerator::BackpatchBranch(li.host_pc, li.host_pc_size, li.host_resolve_pc); + } + + // Don't have to do anything special for successors - just let the successor know it's no longer linked. 
+ li.block->link_predecessors.erase(iter); } block->link_successors.clear(); + +#ifdef WITH_RECOMPILER + if (g_settings.IsUsingRecompiler() && g_settings.cpu_recompiler_block_linking) + s_code_buffer.WriteProtect(true); +#endif } #ifdef WITH_RECOMPILER @@ -1104,3 +1176,39 @@ Common::PageFaultHandler::HandlerResult LUTPageFaultHandler(void* exception_pc, #endif // WITH_RECOMPILER } // namespace CPU::CodeCache + +#ifdef WITH_RECOMPILER + +void CPU::Recompiler::Thunks::ResolveBranch(CodeBlock* block, void* host_pc, void* host_resolve_pc, u32 host_pc_size) +{ + using namespace CPU::CodeCache; + + CodeBlockKey key = GetNextBlockKey(); + CodeBlock* successor_block = LookupBlock(key); + if (!successor_block || (successor_block->invalidated && !RevalidateBlock(successor_block)) || !block->can_link || + !successor_block->can_link) + { + // just turn it into a return to the dispatcher instead. + s_code_buffer.WriteProtect(false); + CodeGenerator::BackpatchReturn(host_pc, host_pc_size); + s_code_buffer.WriteProtect(true); + } + else + { + // link blocks! 
+ LinkBlock(block, successor_block, host_pc, host_resolve_pc, host_pc_size); + } +} + +void CPU::Recompiler::Thunks::LogPC(u32 pc) +{ +#if 0 + CPU::CodeCache::LogCurrentState(); +#endif +#if 0 + if (TimingEvents::GetGlobalTickCounter() + GetPendingTicks() == 382856482) + __debugbreak(); +#endif +} + +#endif // WITH_RECOMPILER diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h index a32bd944d..1b4debe5d 100644 --- a/src/core/cpu_code_cache.h +++ b/src/core/cpu_code_cache.h @@ -60,6 +60,14 @@ struct CodeBlock { using HostCodePointer = void (*)(); + struct LinkInfo + { + CodeBlock* block; + void* host_pc; + void* host_resolve_pc; + u32 host_pc_size; + }; + CodeBlock(const CodeBlockKey key_) : key(key_) {} CodeBlockKey key; @@ -67,8 +75,8 @@ struct CodeBlock HostCodePointer host_code = nullptr; std::vector instructions; - std::vector link_predecessors; - std::vector link_successors; + std::vector link_predecessors; + std::vector link_successors; TickCount uncached_fetch_ticks = 0; u32 icache_line_count = 0; @@ -80,9 +88,11 @@ struct CodeBlock bool contains_loadstore_instructions = false; bool contains_double_branches = false; bool invalidated = false; + bool can_link = true; u32 recompile_frame_number = 0; u32 recompile_count = 0; + u32 invalidate_frame_number = 0; const u32 GetPC() const { return key.GetPC(); } const u32 GetSizeInBytes() const { return static_cast(instructions.size()) * sizeof(Instruction); } diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index bead34870..eec108cb9 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -21,23 +21,20 @@ bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* o m_block = block; m_block_start = block->instructions.data(); m_block_end = block->instructions.data() + block->instructions.size(); + m_pc = block->GetPC(); m_pc_valid = true; - EmitBeginBlock(); + 
m_fastmem_load_base_in_register = false; + m_fastmem_store_base_in_register = false; + + EmitBeginBlock(true); BlockPrologue(); - const CodeBlockInstruction* cbi = m_block_start; - while (cbi != m_block_end) + m_current_instruction = m_block_start; + while (m_current_instruction != m_block_end) { -#ifdef _DEBUG - SmallString disasm; - DisassembleInstruction(&disasm, cbi->pc, cbi->instruction.bits); - Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray()); -#endif - - m_current_instruction = cbi; - if (!CompileInstruction(*cbi)) + if (!CompileInstruction(*m_current_instruction)) { m_current_instruction = nullptr; m_block_end = nullptr; @@ -46,11 +43,14 @@ bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* o return false; } - cbi++; + m_current_instruction++; } - BlockEpilogue(); - EmitEndBlock(); + if (!m_block_linked) + { + BlockEpilogue(); + EmitEndBlock(true, true); + } FinalizeBlock(out_host_code, out_host_code_size); Log_ProfilePrintf("JIT block 0x%08X: %zu instructions (%u bytes), %u host bytes", block->GetPC(), @@ -957,6 +957,10 @@ void CodeGenerator::BlockPrologue() EmitStoreCPUStructField(offsetof(State, exception_raised), Value::FromConstantU8(0)); +#if 0 + EmitFunctionCall(nullptr, &Thunks::LogPC, Value::FromConstantU32(m_pc)); +#endif + if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0) EmitICacheCheckAndUpdate(); @@ -2184,7 +2188,10 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) { InstructionPrologue(cbi, 1); - auto DoBranch = [this](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg, Value&& branch_target) { + auto DoBranch = [this, &cbi](Condition condition, const Value& lhs, const Value& rhs, Reg lr_reg, + Value&& branch_target) { + const bool can_link_block = cbi.is_direct_branch_instruction && g_settings.cpu_recompiler_block_linking; + // ensure the lr register is flushed, since we want it's correct value after the branch // we don't want to 
invalidate it yet because of "jalr r0, r0", branch_target could be the lr_reg. if (lr_reg != Reg::count && lr_reg != Reg::zero) @@ -2199,16 +2206,58 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) EmitCopyValue(next_pc.GetHostRegister(), CalculatePC(4)); } - LabelType branch_not_taken; + Value take_branch; + LabelType branch_taken, branch_not_taken; if (condition != Condition::Always) { - // condition is inverted because we want the case for skipping it - if (lhs.IsValid() && rhs.IsValid()) - EmitConditionalBranch(condition, true, lhs.host_reg, rhs, &branch_not_taken); - else if (lhs.IsValid()) - EmitConditionalBranch(condition, true, lhs.host_reg, lhs.size, &branch_not_taken); + if (!can_link_block) + { + // condition is inverted because we want the case for skipping it + if (lhs.IsValid() && rhs.IsValid()) + EmitConditionalBranch(condition, true, lhs.host_reg, rhs, &branch_not_taken); + else if (lhs.IsValid()) + EmitConditionalBranch(condition, true, lhs.host_reg, lhs.size, &branch_not_taken); + else + EmitConditionalBranch(condition, true, &branch_not_taken); + } else - EmitConditionalBranch(condition, true, &branch_not_taken); + { + take_branch = m_register_cache.AllocateScratch(RegSize_32); + switch (condition) + { + case Condition::NotEqual: + case Condition::Equal: + case Condition::Overflow: + case Condition::Greater: + case Condition::GreaterEqual: + case Condition::LessEqual: + case Condition::Less: + case Condition::Above: + case Condition::AboveEqual: + case Condition::Below: + case Condition::BelowEqual: + { + EmitCmp(lhs.GetHostRegister(), rhs); + EmitSetConditionResult(take_branch.GetHostRegister(), take_branch.size, condition); + } + break; + + case Condition::Negative: + case Condition::PositiveOrZero: + case Condition::NotZero: + case Condition::Zero: + { + Assert(!rhs.IsValid() || (rhs.IsConstant() && rhs.GetS64ConstantValue() == 0)); + EmitTest(lhs.GetHostRegister(), lhs); + 
EmitSetConditionResult(take_branch.GetHostRegister(), take_branch.size, condition); + } + break; + + default: + UnreachableCode(); + break; + } + } } // save the old PC if we want to @@ -2218,6 +2267,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) // if we don't cancel it, at the end of the instruction the value we write can be overridden. EmitCancelInterpreterLoadDelayForReg(lr_reg); EmitStoreGuestRegister(lr_reg, next_pc); + + // now invalidate lr because it was possibly written in the branch + m_register_cache.InvalidateGuestRegister(lr_reg); } // we don't need to test the address of constant branches unless they're definitely misaligned, which would be @@ -2256,24 +2308,125 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) m_register_cache.PopState(); } - if (condition != Condition::Always) + if (can_link_block) { - // branch taken path - modify the next pc - EmitCopyValue(next_pc.GetHostRegister(), branch_target); + // if it's an in-block branch, compile the delay slot now + // TODO: Make this more optimal by moving the condition down if it's a nop + Assert((m_current_instruction + 1) != m_block_end); + InstructionEpilogue(cbi); + m_current_instruction++; + if (!CompileInstruction(*m_current_instruction)) + return false; - // converge point - EmitBindLabel(&branch_not_taken); - WriteNewPC(next_pc, true); + // flush all regs since we're at the end of the block now + BlockEpilogue(); + m_block_linked = true; + + // check downcount + Value pending_ticks = m_register_cache.AllocateScratch(RegSize_32); + Value downcount = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(pending_ticks.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks)); + EmitLoadCPUStructField(downcount.GetHostRegister(), RegSize_32, offsetof(State, downcount)); + + // pending < downcount + LabelType return_to_dispatcher; + + if (condition != Condition::Always) + { + EmitBranchIfBitClear(take_branch.GetHostRegister(), 
take_branch.size, 0, &branch_not_taken); + m_register_cache.PushState(); + { + WriteNewPC(branch_target, false); + EmitConditionalBranch(Condition::GreaterEqual, false, pending_ticks.GetHostRegister(), downcount, + &return_to_dispatcher); + + // we're committed at this point :D + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), branch_target); + EmitEndBlock(true, false); + + const void* jump_pointer = GetCurrentCodePointer(); + const void* resolve_pointer = GetCurrentFarCodePointer(); + EmitBranch(resolve_pointer); + const u32 jump_size = static_cast(static_cast(GetCurrentCodePointer()) - + static_cast(jump_pointer)); + SwitchToFarCode(); + + EmitBeginBlock(true); + EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::ResolveBranch, Value::FromConstantPtr(m_block), + Value::FromConstantPtr(jump_pointer), Value::FromConstantPtr(resolve_pointer), + Value::FromConstantU32(jump_size)); + EmitEndBlock(true, true); + } + m_register_cache.PopState(); + + SwitchToNearCode(); + EmitBindLabel(&branch_not_taken); + } + + m_register_cache.PushState(); + + if (condition != Condition::Always) + { + WriteNewPC(next_pc, true); + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), next_pc); + } + else + { + WriteNewPC(branch_target, true); + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), branch_target); + } + + EmitConditionalBranch(Condition::GreaterEqual, false, pending_ticks.GetHostRegister(), downcount, + &return_to_dispatcher); + + if (condition != Condition::Always) + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), next_pc); + else + EmitStoreCPUStructField(offsetof(State, current_instruction_pc), branch_target); + + EmitEndBlock(true, false); + + const void* jump_pointer = GetCurrentCodePointer(); + const void* resolve_pointer = GetCurrentFarCodePointer(); + EmitBranch(GetCurrentFarCodePointer()); + const u32 jump_size = + static_cast(static_cast(GetCurrentCodePointer()) - static_cast(jump_pointer)); + 
SwitchToFarCode(); + + EmitBeginBlock(true); + EmitFunctionCall(nullptr, &CPU::Recompiler::Thunks::ResolveBranch, Value::FromConstantPtr(m_block), + Value::FromConstantPtr(jump_pointer), Value::FromConstantPtr(resolve_pointer), + Value::FromConstantU32(jump_size)); + EmitEndBlock(true, true); + + m_register_cache.PopState(); + + SwitchToNearCode(); + EmitBindLabel(&return_to_dispatcher); + EmitEndBlock(true, true); } else { - // next_pc is not used for unconditional branches - WriteNewPC(branch_target, true); + if (condition != Condition::Always) + { + // branch taken path - modify the next pc + EmitBindLabel(&branch_taken); + EmitCopyValue(next_pc.GetHostRegister(), branch_target); + + // converge point + EmitBindLabel(&branch_not_taken); + WriteNewPC(next_pc, true); + } + else + { + // next_pc is not used for unconditional branches + WriteNewPC(branch_target, true); + } + + InstructionEpilogue(cbi); } - // now invalidate lr becuase it was possibly written in the branch - if (lr_reg != Reg::count && lr_reg != Reg::zero) - m_register_cache.InvalidateGuestRegister(lr_reg); + return true; }; // Compute the branch target. @@ -2287,10 +2440,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) Value branch_target = OrValues(AndValues(CalculatePC(), Value::FromConstantU32(0xF0000000)), Value::FromConstantU32(cbi.instruction.j.target << 2)); - DoBranch(Condition::Always, Value(), Value(), (cbi.instruction.op == InstructionOp::jal) ? Reg::ra : Reg::count, - std::move(branch_target)); + return DoBranch(Condition::Always, Value(), Value(), + (cbi.instruction.op == InstructionOp::jal) ? 
Reg::ra : Reg::count, std::move(branch_target)); } - break; case InstructionOp::funct: { @@ -2298,9 +2450,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) { // npc = rs, link to rt Value branch_target = m_register_cache.ReadGuestRegister(cbi.instruction.r.rs); - DoBranch(Condition::Always, Value(), Value(), - (cbi.instruction.r.funct == InstructionFunct::jalr) ? cbi.instruction.r.rd : Reg::count, - std::move(branch_target)); + return DoBranch(Condition::Always, Value(), Value(), + (cbi.instruction.r.funct == InstructionFunct::jalr) ? cbi.instruction.r.rd : Reg::count, + std::move(branch_target)); } else if (cbi.instruction.r.funct == InstructionFunct::syscall || cbi.instruction.r.funct == InstructionFunct::break_) @@ -2308,13 +2460,15 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) const Exception excode = (cbi.instruction.r.funct == InstructionFunct::syscall) ? Exception::Syscall : Exception::BP; GenerateExceptionExit(cbi, excode); + InstructionEpilogue(cbi); + return true; } else { UnreachableCode(); + return false; } } - break; case InstructionOp::beq: case InstructionOp::bne: @@ -2326,7 +2480,7 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) if (cbi.instruction.op == InstructionOp::beq && cbi.instruction.i.rs == Reg::zero && cbi.instruction.i.rt == Reg::zero) { - DoBranch(Condition::Always, Value(), Value(), Reg::count, std::move(branch_target)); + return DoBranch(Condition::Always, Value(), Value(), Reg::count, std::move(branch_target)); } else { @@ -2334,10 +2488,9 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) Value lhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rs, true, true); Value rhs = m_register_cache.ReadGuestRegister(cbi.instruction.i.rt); const Condition condition = (cbi.instruction.op == InstructionOp::beq) ? 
Condition::Equal : Condition::NotEqual; - DoBranch(condition, lhs, rhs, Reg::count, std::move(branch_target)); + return DoBranch(condition, lhs, rhs, Reg::count, std::move(branch_target)); } } - break; case InstructionOp::bgtz: case InstructionOp::blez: @@ -2350,9 +2503,8 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) const Condition condition = (cbi.instruction.op == InstructionOp::bgtz) ? Condition::Greater : Condition::LessEqual; - DoBranch(condition, lhs, Value::FromConstantU32(0), Reg::count, std::move(branch_target)); + return DoBranch(condition, lhs, Value::FromConstantU32(0), Reg::count, std::move(branch_target)); } - break; case InstructionOp::b: { @@ -2378,17 +2530,13 @@ bool CodeGenerator::Compile_Branch(const CodeBlockInstruction& cbi) m_register_cache.WriteGuestRegister(Reg::ra, CalculatePC(4)); } - DoBranch(condition, lhs, Value(), Reg::count, std::move(branch_target)); + return DoBranch(condition, lhs, Value(), Reg::count, std::move(branch_target)); } - break; default: UnreachableCode(); - break; + return false; } - - InstructionEpilogue(cbi); - return true; } bool CodeGenerator::Compile_lui(const CodeBlockInstruction& cbi) diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 37d0d3e44..49cc452c8 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -2,6 +2,7 @@ #include #include #include +#include #include "common/jit_code_buffer.h" @@ -25,6 +26,8 @@ public: static void AlignCodeBuffer(JitCodeBuffer* code_buffer); static bool BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi); + static void BackpatchBranch(void* pc, u32 pc_size, void* target); + static void BackpatchReturn(void* pc, u32 pc_size); bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); @@ -34,8 +37,8 @@ public: ////////////////////////////////////////////////////////////////////////// // Code Generation 
////////////////////////////////////////////////////////////////////////// - void EmitBeginBlock(); - void EmitEndBlock(); + void EmitBeginBlock(bool allocate_registers = true); + void EmitEndBlock(bool free_registers = true, bool emit_return = true); void EmitExceptionExit(); void EmitExceptionExitOnBool(const Value& value); void FinalizeBlock(CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size); @@ -105,6 +108,7 @@ public: void EmitConditionalBranch(Condition condition, bool invert, HostReg lhs, const Value& rhs, LabelType* label); void EmitConditionalBranch(Condition condition, bool invert, LabelType* label); void EmitBranchIfBitClear(HostReg reg, RegSize size, u8 bit, LabelType* label); + void EmitBranchIfBitSet(HostReg reg, RegSize size, u8 bit, LabelType* label); void EmitBindLabel(LabelType* label); u32 PrepareStackForCall(); @@ -250,6 +254,7 @@ private: u32 m_pc = 0; bool m_pc_valid = false; + bool m_block_linked = false; // whether various flags need to be reset. bool m_current_instruction_in_branch_delay_slot_dirty = false; diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index 723f7ee42..178803175 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -166,31 +166,42 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al return new_value; } -void CodeGenerator::EmitBeginBlock() +void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) { m_emit->sub(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - // Save the link register, since we'll be calling functions. - const bool link_reg_allocated = m_register_cache.AllocateHostReg(14); - DebugAssert(link_reg_allocated); - UNREFERENCED_VARIABLE(link_reg_allocated); - m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + if (allocate_registers) + { + // Save the link register, since we'll be calling functions. 
+ const bool link_reg_allocated = m_register_cache.AllocateHostReg(14); + DebugAssert(link_reg_allocated); + UNREFERENCED_VARIABLE(link_reg_allocated); + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); - // Store the CPU struct pointer. TODO: make this better. - const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); - // m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); - DebugAssert(cpu_reg_allocated); - UNREFERENCED_VARIABLE(cpu_reg_allocated); + // Store the CPU struct pointer. TODO: make this better. + const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); + // m_emit->Mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + DebugAssert(cpu_reg_allocated); + UNREFERENCED_VARIABLE(cpu_reg_allocated); + } } -void CodeGenerator::EmitEndBlock() +void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) { - m_register_cache.FreeHostReg(RCPUPTR); - m_register_cache.PopCalleeSavedRegisters(true); + if (free_registers) + { + m_register_cache.FreeHostReg(RCPUPTR); + m_register_cache.FreeHostReg(14); + m_register_cache.PopCalleeSavedRegisters(true); + } m_emit->add(a32::sp, a32::sp, FUNCTION_STACK_SIZE); - // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); - m_emit->bx(a32::lr); + + if (emit_return) + { + // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); + m_emit->bx(a32::lr); + } } void CodeGenerator::EmitExceptionExit() @@ -1572,6 +1583,49 @@ bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) return true; } +void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) +{ + Log_ProfilePrintf("Backpatching %p to return", pc); + + vixl::aarch32::MacroAssembler emit(static_cast(pc), pc_size, a32::A32); + emit.bx(a32::lr); + + const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + 
JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + +void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) +{ + Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); + + vixl::aarch32::MacroAssembler emit(static_cast(pc), pc_size, a32::A32); + + // check jump distance + const s32 displacement = GetPCDisplacement(pc, target); + if (!IsPCDisplacementInImmediateRange(displacement)) + { + emit.Mov(GetHostReg32(RSCRATCH), reinterpret_cast(target)); + emit.bx(GetHostReg32(RSCRATCH)); + } + else + { + a32::Label label(displacement + emit.GetCursorOffset()); + emit.b(&label); + } + + // shouldn't have any nops + const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { EmitLoadGlobalAddress(RSCRATCH, ptr); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index c8e27dadf..1e356e0d4 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -194,41 +194,51 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al return new_value; } -void CodeGenerator::EmitBeginBlock() +void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) { m_emit->Sub(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - // Save the link register, since we'll be calling functions. - const bool link_reg_allocated = m_register_cache.AllocateHostReg(30); - DebugAssert(link_reg_allocated); - UNREFERENCED_VARIABLE(link_reg_allocated); - m_register_cache.AssumeCalleeSavedRegistersAreSaved(); - - // Store the CPU struct pointer. TODO: make this better. 
- const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); - DebugAssert(cpu_reg_allocated); - UNREFERENCED_VARIABLE(cpu_reg_allocated); - - // If there's loadstore instructions, preload the fastmem base. - if (m_block->contains_loadstore_instructions) + if (allocate_registers) { - const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); - Assert(fastmem_reg_allocated); - m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); + // Save the link register, since we'll be calling functions. + const bool link_reg_allocated = m_register_cache.AllocateHostReg(30); + DebugAssert(link_reg_allocated); + UNREFERENCED_VARIABLE(link_reg_allocated); + + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + + // Store the CPU struct pointer. TODO: make this better. + const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); + DebugAssert(cpu_reg_allocated); + UNREFERENCED_VARIABLE(cpu_reg_allocated); + + // If there's loadstore instructions, preload the fastmem base. 
+ if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + Assert(fastmem_reg_allocated); + m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base))); + } } } -void CodeGenerator::EmitEndBlock() +void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) { - if (m_block->contains_loadstore_instructions) - m_register_cache.FreeHostReg(RMEMBASEPTR); + if (free_registers) + { + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); - m_register_cache.FreeHostReg(RCPUPTR); - m_register_cache.PopCalleeSavedRegisters(true); + m_register_cache.FreeHostReg(RCPUPTR); + m_register_cache.FreeHostReg(30); // lr + + m_register_cache.PopCalleeSavedRegisters(true); + } m_emit->Add(a64::sp, a64::sp, FUNCTION_STACK_SIZE); - // m_emit->b(GetPCDisplacement(GetCurrentCodePointer(), s_dispatcher_return_address)); - m_emit->Ret(); + + if (emit_return) + m_emit->Ret(); } void CodeGenerator::EmitExceptionExit() @@ -1767,6 +1777,42 @@ bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) return true; } +void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) +{ + Log_ProfilePrintf("Backpatching %p to return", pc); + + vixl::aarch64::MacroAssembler emit(static_cast(pc), pc_size, a64::PositionDependentCode); + emit.ret(); + + const s32 nops = (static_cast(pc_size) - static_cast(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + +void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) +{ + Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); + + // check jump distance + const s64 jump_distance = static_cast(reinterpret_cast(target) - reinterpret_cast(pc)); + Assert(Common::IsAligned(jump_distance, 4)); + 
Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2)); + + vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(pc), pc_size, a64::PositionDependentCode); + emit.b(jump_distance >> 2); + + // shouldn't have any nops + const s32 nops = (static_cast<s32>(pc_size) - static_cast<s32>(emit.GetCursorOffset())) / 4; + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + emit.nop(); + + JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { EmitLoadGlobalAddress(RSCRATCH, ptr); diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 99d374118..647de1409 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -206,35 +206,42 @@ Value CodeGenerator::GetValueInHostOrScratchRegister(const Value& value, bool al return new_value; } -void CodeGenerator::EmitBeginBlock() +void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */) { - m_register_cache.AssumeCalleeSavedRegistersAreSaved(); - - // Store the CPU struct pointer. - const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); - DebugAssert(cpu_reg_allocated); - UNREFERENCED_VARIABLE(cpu_reg_allocated); - // m_emit->mov(GetCPUPtrReg(), reinterpret_cast<size_t>(&g_state)); - - // If there's loadstore instructions, preload the fastmem base. - if (m_block->contains_loadstore_instructions) + if (allocate_registers) { - const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); - Assert(fastmem_reg_allocated); - UNREFERENCED_VARIABLE(fastmem_reg_allocated); - m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]); + m_register_cache.AssumeCalleeSavedRegistersAreSaved(); + + // Store the CPU struct pointer. 
+ const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR); + DebugAssert(cpu_reg_allocated); + UNREFERENCED_VARIABLE(cpu_reg_allocated); + // m_emit->mov(GetCPUPtrReg(), reinterpret_cast(&g_state)); + + // If there's loadstore instructions, preload the fastmem base. + if (m_block->contains_loadstore_instructions) + { + const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR); + DebugAssert(fastmem_reg_allocated); + UNREFERENCED_VARIABLE(fastmem_reg_allocated); + m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]); + } } } -void CodeGenerator::EmitEndBlock() +void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, bool emit_return /* = true */) { - m_register_cache.FreeHostReg(RCPUPTR); - if (m_block->contains_loadstore_instructions) - m_register_cache.FreeHostReg(RMEMBASEPTR); + if (free_registers) + { + m_register_cache.FreeHostReg(RCPUPTR); + if (m_block->contains_loadstore_instructions) + m_register_cache.FreeHostReg(RMEMBASEPTR); - m_register_cache.PopCalleeSavedRegisters(true); + m_register_cache.PopCalleeSavedRegisters(true); + } - m_emit->ret(); + if (emit_return) + m_emit->ret(); } void CodeGenerator::EmitExceptionExit() @@ -2336,7 +2343,7 @@ void CodeGenerator::EmitUpdateFastmemBase() bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) { - Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc); + Log_ProfilePrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc); // turn it into a jump to the slowmem handler Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc); @@ -2352,6 +2359,39 @@ bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi) return true; } +void CodeGenerator::BackpatchReturn(void* pc, u32 pc_size) +{ + Log_ProfilePrintf("Backpatching %p to return", pc); + + Xbyak::CodeGenerator cg(pc_size, pc); + cg.ret(); + + const s32 nops = + 
static_cast<s32>(pc_size) - static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(pc))); + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + cg.nop(); + + JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + +void CodeGenerator::BackpatchBranch(void* pc, u32 pc_size, void* target) +{ + Log_ProfilePrintf("Backpatching %p to %p [branch]", pc, target); + + Xbyak::CodeGenerator cg(pc_size, pc); + cg.jmp(target); + + // shouldn't have any nops + const s32 nops = + static_cast<s32>(pc_size) - static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(pc))); + Assert(nops >= 0); + for (s32 i = 0; i < nops; i++) + cg.nop(); + + JitCodeBuffer::FlushInstructionCache(pc, pc_size); +} + void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr) { const s64 displacement = @@ -2851,6 +2891,59 @@ void CodeGenerator::EmitConditionalBranch(Condition condition, bool invert, Labe } } +void CodeGenerator::EmitBranchIfBitSet(HostReg reg, RegSize size, u8 bit, LabelType* label) +{ + if (bit < 8) + { + // same size, probably faster + switch (size) + { + case RegSize_8: + m_emit->test(GetHostReg8(reg), (1u << bit)); + m_emit->jnz(*label); + break; + + case RegSize_16: + m_emit->test(GetHostReg16(reg), (1u << bit)); + m_emit->jnz(*label); + break; + + case RegSize_32: + m_emit->test(GetHostReg32(reg), (1u << bit)); + m_emit->jnz(*label); + break; + + default: + UnreachableCode(); + break; + } + } + else + { + switch (size) + { + case RegSize_8: + m_emit->bt(GetHostReg8(reg), bit); + m_emit->jc(*label); + break; + + case RegSize_16: + m_emit->bt(GetHostReg16(reg), bit); + m_emit->jc(*label); + break; + + case RegSize_32: + m_emit->bt(GetHostReg32(reg), bit); + m_emit->jc(*label); + break; + + default: + UnreachableCode(); + break; + } + } +} + void CodeGenerator::EmitBranchIfBitClear(HostReg reg, RegSize size, u8 bit, LabelType* label) { if (bit < 8) diff --git a/src/core/cpu_recompiler_register_cache.cpp b/src/core/cpu_recompiler_register_cache.cpp index 3aae2d167..8c089b917 
100644 --- a/src/core/cpu_recompiler_register_cache.cpp +++ b/src/core/cpu_recompiler_register_cache.cpp @@ -280,6 +280,21 @@ Value RegisterCache::AllocateScratch(RegSize size, HostReg reg /* = HostReg_Inva return Value::FromScratch(this, reg, size); } +void RegisterCache::ReserveCallerSavedRegisters() +{ + for (u32 reg = 0; reg < HostReg_Count; reg++) + { + if ((m_state.host_reg_state[reg] & (HostRegState::CalleeSaved | HostRegState::CalleeSavedAllocated)) == + HostRegState::CalleeSaved) + { + DebugAssert(m_state.callee_saved_order_count < HostReg_Count); + m_code_generator.EmitPushHostReg(static_cast<HostReg>(reg), GetActiveCalleeSavedRegisterCount()); + m_state.callee_saved_order[m_state.callee_saved_order_count++] = static_cast<HostReg>(reg); + m_state.host_reg_state[reg] |= HostRegState::CalleeSavedAllocated; + } + } +} + u32 RegisterCache::PushCallerSavedRegisters() const { u32 position = GetActiveCalleeSavedRegisterCount(); diff --git a/src/core/cpu_recompiler_register_cache.h b/src/core/cpu_recompiler_register_cache.h index 4863f318d..0492a4e5e 100644 --- a/src/core/cpu_recompiler_register_cache.h +++ b/src/core/cpu_recompiler_register_cache.h @@ -193,6 +193,16 @@ struct Value static Value FromConstantU32(u32 value) { return FromConstant(ZeroExtend64(value), RegSize_32); } static Value FromConstantS32(s32 value) { return FromConstant(ZeroExtend64(static_cast<u32>(value)), RegSize_32); } static Value FromConstantU64(u64 value) { return FromConstant(value, RegSize_64); } + static Value FromConstantPtr(const void* pointer) + { +#if defined(CPU_AARCH64) || defined(CPU_X64) + return FromConstant(static_cast<u64>(reinterpret_cast<uintptr_t>(pointer)), RegSize_64); +#elif defined(CPU_AARCH32) + return FromConstant(static_cast<u32>(reinterpret_cast<uintptr_t>(pointer)), RegSize_32); +#else + return FromConstant(0, RegSize_32); +#endif + } private: void Release(); @@ -241,6 +251,9 @@ public: /// Ensures a host register is free, removing any value cached. 
void EnsureHostRegFree(HostReg reg); + /// Preallocates caller saved registers, enabling later use without stack pushes. + void ReserveCallerSavedRegisters(); + /// Push/pop volatile host registers. Returns the number of registers pushed/popped. u32 PushCallerSavedRegisters() const; u32 PopCallerSavedRegisters() const; diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h index 1a206e0c3..bfddb5d94 100644 --- a/src/core/cpu_recompiler_thunks.h +++ b/src/core/cpu_recompiler_thunks.h @@ -3,6 +3,7 @@ #include "cpu_types.h" namespace CPU { +struct CodeBlock; struct CodeBlockInstruction; namespace Recompiler::Thunks { @@ -32,6 +33,9 @@ void UncheckedWriteMemoryByte(u32 address, u32 value); void UncheckedWriteMemoryHalfWord(u32 address, u32 value); void UncheckedWriteMemoryWord(u32 address, u32 value); +void ResolveBranch(CodeBlock* block, void* host_pc, void* host_resolve_pc, u32 host_pc_size); +void LogPC(u32 pc); + } // namespace Recompiler::Thunks } // namespace CPU diff --git a/src/core/cpu_types.cpp b/src/core/cpu_types.cpp index aa0cc548f..ef534b079 100644 --- a/src/core/cpu_types.cpp +++ b/src/core/cpu_types.cpp @@ -104,24 +104,25 @@ bool IsDirectBranchInstruction(const Instruction& instruction) } } -u32 GetBranchInstructionTarget(const Instruction& instruction, u32 instruction_pc) +VirtualMemoryAddress GetDirectBranchTarget(const Instruction& instruction, VirtualMemoryAddress instruction_pc) { + const VirtualMemoryAddress pc = instruction_pc + 4; + switch (instruction.op) { case InstructionOp::j: case InstructionOp::jal: - return ((instruction_pc + 4) & UINT32_C(0xF0000000)) | (instruction.j.target << 2); + return (pc & UINT32_C(0xF0000000)) | (instruction.j.target << 2); case InstructionOp::b: case InstructionOp::beq: case InstructionOp::bgtz: case InstructionOp::blez: case InstructionOp::bne: - return instruction_pc + 4 + (instruction.i.imm_sext32() << 2); + return (pc + (instruction.i.imm_sext32() << 2)); default: - Panic("Trying to 
get branch target of indirect or invalid branch"); - return instruction_pc; + return pc; } } diff --git a/src/core/cpu_types.h b/src/core/cpu_types.h index f1fa878c4..6c0f19df2 100644 --- a/src/core/cpu_types.h +++ b/src/core/cpu_types.h @@ -223,7 +223,7 @@ bool IsNopInstruction(const Instruction& instruction); bool IsBranchInstruction(const Instruction& instruction); bool IsUnconditionalBranchInstruction(const Instruction& instruction); bool IsDirectBranchInstruction(const Instruction& instruction); -u32 GetBranchInstructionTarget(const Instruction& instruction, u32 instruction_pc); +VirtualMemoryAddress GetDirectBranchTarget(const Instruction& instruction, VirtualMemoryAddress instruction_pc); bool IsCallInstruction(const Instruction& instruction); bool IsReturnInstruction(const Instruction& instruction); bool IsMemoryLoadInstruction(const Instruction& instruction); diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp index e7d00b62f..09c3ae52a 100644 --- a/src/core/host_interface.cpp +++ b/src/core/host_interface.cpp @@ -511,6 +511,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si) si.SetIntValue("CPU", "OverclockDenominator", 1); si.SetBoolValue("CPU", "OverclockEnable", false); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", false); + si.SetBoolValue("CPU", "RecompilerBlockLinking", true); si.SetBoolValue("CPU", "ICache", false); si.SetBoolValue("CPU", "FastmemMode", Settings::GetCPUFastmemModeName(Settings::DEFAULT_CPU_FASTMEM_MODE)); @@ -772,24 +773,15 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings) } if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler && - g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions) + (g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions || + g_settings.cpu_recompiler_block_linking != old_settings.cpu_recompiler_block_linking || + g_settings.cpu_recompiler_icache != 
old_settings.cpu_recompiler_icache)) { - AddOSDMessage(g_settings.cpu_recompiler_memory_exceptions ? - TranslateStdString("OSDMessage", "CPU memory exceptions enabled, flushing all blocks.") : - TranslateStdString("OSDMessage", "CPU memory exceptions disabled, flushing all blocks."), - 5.0f); + AddOSDMessage(TranslateStdString("OSDMessage", "Recompiler options changed, flushing all blocks."), 5.0f); CPU::CodeCache::Flush(); - } - if (g_settings.cpu_execution_mode != CPUExecutionMode::Interpreter && - g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache) - { - AddOSDMessage(g_settings.cpu_recompiler_icache ? - TranslateStdString("OSDMessage", "CPU ICache enabled, flushing all blocks.") : - TranslateStdString("OSDMessage", "CPU ICache disabled, flushing all blocks."), - 5.0f); - CPU::CodeCache::Flush(); - CPU::ClearICache(); + if (g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache) + CPU::ClearICache(); } m_audio_stream->SetOutputVolume(GetAudioOutputVolume()); diff --git a/src/core/settings.cpp b/src/core/settings.cpp index f83fb23ff..a329415f5 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -172,6 +172,7 @@ void Settings::Load(SettingsInterface& si) cpu_overclock_enable = si.GetBoolValue("CPU", "OverclockEnable", false); UpdateOverclockActive(); cpu_recompiler_memory_exceptions = si.GetBoolValue("CPU", "RecompilerMemoryExceptions", false); + cpu_recompiler_block_linking = si.GetBoolValue("CPU", "RecompilerBlockLinking", true); cpu_recompiler_icache = si.GetBoolValue("CPU", "RecompilerICache", false); cpu_fastmem_mode = ParseCPUFastmemMode( si.GetStringValue("CPU", "FastmemMode", GetCPUFastmemModeName(DEFAULT_CPU_FASTMEM_MODE)).c_str()) @@ -363,6 +364,7 @@ void Settings::Save(SettingsInterface& si) const si.SetIntValue("CPU", "OverclockNumerator", cpu_overclock_numerator); si.SetIntValue("CPU", "OverclockDenominator", cpu_overclock_denominator); si.SetBoolValue("CPU", "RecompilerMemoryExceptions", 
cpu_recompiler_memory_exceptions); + si.SetBoolValue("CPU", "RecompilerBlockLinking", cpu_recompiler_block_linking); si.SetBoolValue("CPU", "RecompilerICache", cpu_recompiler_icache); si.SetStringValue("CPU", "FastmemMode", GetCPUFastmemModeName(cpu_fastmem_mode)); diff --git a/src/core/settings.h b/src/core/settings.h index e26828ad9..b71003b70 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -78,6 +78,7 @@ struct Settings bool cpu_overclock_enable = false; bool cpu_overclock_active = false; bool cpu_recompiler_memory_exceptions = false; + bool cpu_recompiler_block_linking = true; bool cpu_recompiler_icache = false; CPUFastmemMode cpu_fastmem_mode = CPUFastmemMode::Disabled; diff --git a/src/duckstation-qt/advancedsettingswidget.cpp b/src/duckstation-qt/advancedsettingswidget.cpp index cd1cc383f..c0df26f28 100644 --- a/src/duckstation-qt/advancedsettingswidget.cpp +++ b/src/duckstation-qt/advancedsettingswidget.cpp @@ -159,6 +159,8 @@ AdvancedSettingsWidget::AdvancedSettingsWidget(QtHostInterface* host_interface, addBooleanTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler Memory Exceptions"), "CPU", "RecompilerMemoryExceptions", false); + addBooleanTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler Block Linking"), "CPU", + "RecompilerBlockLinking", true); addChoiceTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler Fast Memory Access"), "CPU", "FastmemMode", Settings::ParseCPUFastmemMode, Settings::GetCPUFastmemModeName, Settings::GetCPUFastmemModeDisplayName, "CPUFastmemMode", @@ -226,20 +228,21 @@ void AdvancedSettingsWidget::onResetToDefaultClicked() setFloatRangeTweakOption(m_ui.tweakOptionTable, 4, -1.0f); setFloatRangeTweakOption(m_ui.tweakOptionTable, 5, Settings::DEFAULT_GPU_PGXP_DEPTH_THRESHOLD); setBooleanTweakOption(m_ui.tweakOptionTable, 6, false); - setChoiceTweakOption(m_ui.tweakOptionTable, 7, Settings::DEFAULT_CPU_FASTMEM_MODE); - 
setBooleanTweakOption(m_ui.tweakOptionTable, 8, false); + setBooleanTweakOption(m_ui.tweakOptionTable, 7, true); + setChoiceTweakOption(m_ui.tweakOptionTable, 8, Settings::DEFAULT_CPU_FASTMEM_MODE); setBooleanTweakOption(m_ui.tweakOptionTable, 9, false); setBooleanTweakOption(m_ui.tweakOptionTable, 10, false); setBooleanTweakOption(m_ui.tweakOptionTable, 11, false); setBooleanTweakOption(m_ui.tweakOptionTable, 12, false); - setIntRangeTweakOption(m_ui.tweakOptionTable, 13, Settings::DEFAULT_VRAM_WRITE_DUMP_WIDTH_THRESHOLD); - setIntRangeTweakOption(m_ui.tweakOptionTable, 14, Settings::DEFAULT_VRAM_WRITE_DUMP_HEIGHT_THRESHOLD); - setIntRangeTweakOption(m_ui.tweakOptionTable, 15, static_cast(Settings::DEFAULT_DMA_MAX_SLICE_TICKS)); - setIntRangeTweakOption(m_ui.tweakOptionTable, 16, static_cast(Settings::DEFAULT_DMA_HALT_TICKS)); - setIntRangeTweakOption(m_ui.tweakOptionTable, 17, static_cast(Settings::DEFAULT_GPU_FIFO_SIZE)); - setIntRangeTweakOption(m_ui.tweakOptionTable, 18, static_cast(Settings::DEFAULT_GPU_MAX_RUN_AHEAD)); - setBooleanTweakOption(m_ui.tweakOptionTable, 19, false); - setBooleanTweakOption(m_ui.tweakOptionTable, 20, true); - setBooleanTweakOption(m_ui.tweakOptionTable, 21, false); + setBooleanTweakOption(m_ui.tweakOptionTable, 13, false); + setIntRangeTweakOption(m_ui.tweakOptionTable, 14, Settings::DEFAULT_VRAM_WRITE_DUMP_WIDTH_THRESHOLD); + setIntRangeTweakOption(m_ui.tweakOptionTable, 15, Settings::DEFAULT_VRAM_WRITE_DUMP_HEIGHT_THRESHOLD); + setIntRangeTweakOption(m_ui.tweakOptionTable, 16, static_cast(Settings::DEFAULT_DMA_MAX_SLICE_TICKS)); + setIntRangeTweakOption(m_ui.tweakOptionTable, 17, static_cast(Settings::DEFAULT_DMA_HALT_TICKS)); + setIntRangeTweakOption(m_ui.tweakOptionTable, 18, static_cast(Settings::DEFAULT_GPU_FIFO_SIZE)); + setIntRangeTweakOption(m_ui.tweakOptionTable, 19, static_cast(Settings::DEFAULT_GPU_MAX_RUN_AHEAD)); + setBooleanTweakOption(m_ui.tweakOptionTable, 20, false); + setBooleanTweakOption(m_ui.tweakOptionTable, 
21, true); setBooleanTweakOption(m_ui.tweakOptionTable, 22, false); + setBooleanTweakOption(m_ui.tweakOptionTable, 23, false); } diff --git a/src/frontend-common/fullscreen_ui.cpp b/src/frontend-common/fullscreen_ui.cpp index ae068a213..31e4cc4b8 100644 --- a/src/frontend-common/fullscreen_ui.cpp +++ b/src/frontend-common/fullscreen_ui.cpp @@ -2588,6 +2588,10 @@ void DrawSettingsWindow() settings_changed |= ToggleButton("Enable Recompiler Memory Exceptions", "Enables alignment and bus exceptions. Not needed for any known games.", &s_settings_copy.cpu_recompiler_memory_exceptions); + settings_changed |= ToggleButton( + "Enable Recompiler Block Linking", + "Performance enhancement - jumps directly between blocks instead of returning to the dispatcher.", + &s_settings_copy.cpu_recompiler_block_linking); settings_changed |= EnumChoiceButton("Recompiler Fast Memory Access", "Avoids calls to C++ code, significantly speeding up the recompiler.", &s_settings_copy.cpu_fastmem_mode, &Settings::GetCPUFastmemModeDisplayName, @@ -3902,6 +3906,8 @@ void DrawDebugSettingsMenu() settings_changed |= ImGui::MenuItem("Recompiler Memory Exceptions", nullptr, &s_settings_copy.cpu_recompiler_memory_exceptions); + settings_changed |= + ImGui::MenuItem("Recompiler Block Linking", nullptr, &s_settings_copy.cpu_recompiler_block_linking); if (ImGui::BeginMenu("Recompiler Fastmem")) { for (u32 i = 0; i < static_cast(CPUFastmemMode::Count); i++)