From 30db081a645481cb40b74730ba3f5dcfe12e19c4 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Thu, 15 Jul 2021 21:57:52 +1000 Subject: [PATCH] CPU: Simulate stalls from GTE instructions --- src/core/cpu_core.cpp | 11 ++++ src/core/cpu_core.h | 5 +- src/core/cpu_core_private.h | 11 ++++ src/core/cpu_recompiler_code_generator.cpp | 65 ++++++++++++++++++- src/core/cpu_recompiler_code_generator.h | 5 ++ .../cpu_recompiler_code_generator_aarch32.cpp | 18 +++++ .../cpu_recompiler_code_generator_aarch64.cpp | 17 +++++ .../cpu_recompiler_code_generator_generic.cpp | 31 +++++++++ .../cpu_recompiler_code_generator_x64.cpp | 16 +++++ src/core/gte.cpp | 48 +++++++++++++- src/core/gte.h | 2 +- 11 files changed, 223 insertions(+), 6 deletions(-) diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 5c6442e92..826c3c50d 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -181,7 +181,10 @@ bool DoState(StateWrapper& sw) } if (sw.IsReading()) + { UpdateFastmemBase(); + g_state.gte_completion_tick = 0; + } return !sw.HasError(); } @@ -1462,6 +1465,8 @@ restart_instruction: return; } + StallUntilGTEComplete(); + if (inst.cop.IsCommonInstruction()) { // TODO: Combine with cop0. @@ -1533,6 +1538,7 @@ restart_instruction: if (!ReadMemoryWord(addr, &value)) return; + StallUntilGTEComplete(); GTE::WriteRegister(ZeroExtend32(static_cast(inst.i.rt.GetValue())), value); if constexpr (pgxp_mode >= PGXPMode::Memory) @@ -1549,6 +1555,8 @@ restart_instruction: return; } + StallUntilGTEComplete(); + const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32(); const u32 value = GTE::ReadRegister(ZeroExtend32(static_cast(inst.i.rt.GetValue()))); WriteMemoryWord(addr, value); @@ -1596,7 +1604,10 @@ void DispatchInterrupt() // instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering.. 
SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits); if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction()) + { + StallUntilGTEComplete(); GTE::ExecuteInstruction(g_state.next_instruction.bits); + } // Interrupt raising occurs before the start of the instruction. RaiseException( diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h index 288a899fa..4595b085a 100644 --- a/src/core/cpu_core.h +++ b/src/core/cpu_core.h @@ -46,8 +46,9 @@ union CacheControl struct State { // ticks the CPU has executed - TickCount pending_ticks = 0; TickCount downcount = 0; + TickCount pending_ticks = 0; + TickCount gte_completion_tick = 0; Registers regs = {}; Cop0Registers cop0_regs = {}; @@ -118,6 +119,8 @@ ALWAYS_INLINE TickCount GetPendingTicks() } ALWAYS_INLINE void ResetPendingTicks() { + g_state.gte_completion_tick = + (g_state.pending_ticks < g_state.gte_completion_tick) ? (g_state.gte_completion_tick - g_state.pending_ticks) : 0; g_state.pending_ticks = 0; } ALWAYS_INLINE void AddPendingTicks(TickCount ticks) diff --git a/src/core/cpu_core_private.h b/src/core/cpu_core_private.h index fac106b0d..1be1b13a7 100644 --- a/src/core/cpu_core_private.h +++ b/src/core/cpu_core_private.h @@ -111,4 +111,15 @@ bool WriteMemoryWord(VirtualMemoryAddress addr, u32 value); void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size, TickCount* read_ticks); void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size); +ALWAYS_INLINE void AddGTETicks(TickCount ticks) +{ + g_state.gte_completion_tick = g_state.pending_ticks + ticks + 1; +} + +ALWAYS_INLINE void StallUntilGTEComplete() +{ + g_state.pending_ticks = + (g_state.gte_completion_tick > g_state.pending_ticks) ? 
g_state.gte_completion_tick : g_state.pending_ticks; +} + } // namespace CPU \ No newline at end of file diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp index 6ca207450..c033270e8 100644 --- a/src/core/cpu_recompiler_code_generator.cpp +++ b/src/core/cpu_recompiler_code_generator.cpp @@ -964,6 +964,7 @@ void CodeGenerator::BlockPrologue() m_branch_was_taken_dirty = g_settings.cpu_recompiler_memory_exceptions; m_current_instruction_was_branch_taken_dirty = false; m_load_delay_dirty = true; + m_gte_busy_cycles_dirty = true; m_pc_offset = 0; m_current_instruction_pc_offset = 0; @@ -1067,13 +1068,63 @@ void CodeGenerator::TruncateBlockAtCurrentInstruction() void CodeGenerator::AddPendingCycles(bool commit) { - if (m_delayed_cycles_add == 0) + if (m_delayed_cycles_add == 0 && m_gte_done_cycle <= m_delayed_cycles_add) return; - EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(m_delayed_cycles_add)); + if (m_gte_done_cycle > m_delayed_cycles_add) + { + Value temp = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(temp.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks)); + if (m_delayed_cycles_add > 0) + { + EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(), Value::FromConstantU32(m_delayed_cycles_add), false); + EmitStoreCPUStructField(offsetof(State, pending_ticks), temp); + EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(), + Value::FromConstantU32(m_gte_done_cycle - m_delayed_cycles_add), false); + EmitStoreCPUStructField(offsetof(State, gte_completion_tick), temp); + } + else + { + EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(), Value::FromConstantU32(m_gte_done_cycle), false); + EmitStoreCPUStructField(offsetof(State, gte_completion_tick), temp); + } + } + else + { + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(m_delayed_cycles_add)); + } if (commit) + { + m_gte_done_cycle = std::max(m_gte_done_cycle - 
m_delayed_cycles_add, 0); m_delayed_cycles_add = 0; + } +} + +void CodeGenerator::AddGTETicks(TickCount ticks) +{ + m_gte_done_cycle = m_delayed_cycles_add + ticks; + Log_DebugPrintf("Adding %d GTE ticks", ticks); +} + +void CodeGenerator::StallUntilGTEComplete() +{ + if (!m_gte_busy_cycles_dirty) + { + // simple case - in block scheduling + if (m_gte_done_cycle > m_delayed_cycles_add) + { + Log_DebugPrintf("Stalling for %d ticks from GTE", m_gte_done_cycle - m_delayed_cycles_add); + m_delayed_cycles_add += (m_gte_done_cycle - m_delayed_cycles_add); + } + + return; + } + + // switch to in block scheduling + EmitStallUntilGTEComplete(); + m_gte_done_cycle = 0; + m_gte_busy_cycles_dirty = false; } Value CodeGenerator::CalculatePC(u32 offset /* = 0 */) @@ -2740,6 +2791,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi) { if (cbi.instruction.op == InstructionOp::lwc2 || cbi.instruction.op == InstructionOp::swc2) { + StallUntilGTEComplete(); InstructionPrologue(cbi, 1); const u32 reg = static_cast(cbi.instruction.i.rt.GetValue()); @@ -2786,6 +2838,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi) const u32 reg = static_cast(cbi.instruction.r.rd.GetValue()) + ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? 32 : 0); + StallUntilGTEComplete(); InstructionPrologue(cbi, 1); Value value = DoGTERegisterRead(reg); @@ -2811,6 +2864,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi) const u32 reg = static_cast(cbi.instruction.r.rd.GetValue()) + ((cbi.instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? 32 : 0); + StallUntilGTEComplete(); InstructionPrologue(cbi, 1); Value value = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt); @@ -2833,11 +2887,16 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi) } else { + TickCount func_ticks; + GTE::InstructionImpl func = GTE::GetInstructionImpl(cbi.instruction.bits, &func_ticks); + // forward everything to the GTE. 
+ StallUntilGTEComplete(); InstructionPrologue(cbi, 1); Value instruction_bits = Value::FromConstantU32(cbi.instruction.bits & GTE::Instruction::REQUIRED_BITS_MASK); - EmitFunctionCall(nullptr, GTE::GetInstructionImpl(cbi.instruction.bits), instruction_bits); + EmitFunctionCall(nullptr, func, instruction_bits); + AddGTETicks(func_ticks); InstructionEpilogue(cbi); return true; diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h index 5550cfee1..476efbf34 100644 --- a/src/core/cpu_recompiler_code_generator.h +++ b/src/core/cpu_recompiler_code_generator.h @@ -71,6 +71,7 @@ public: void EmitMoveNextInterpreterLoadDelay(); void EmitCancelInterpreterLoadDelayForReg(Reg reg); void EmitICacheCheckAndUpdate(); + void EmitStallUntilGTEComplete(); void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset); void EmitStoreCPUStructField(u32 offset, const Value& value); void EmitAddCPUStructField(u32 offset, const Value& value); @@ -200,6 +201,8 @@ private: void InstructionEpilogue(const CodeBlockInstruction& cbi); void TruncateBlockAtCurrentInstruction(); void AddPendingCycles(bool commit); + void AddGTETicks(TickCount ticks); + void StallUntilGTEComplete(); Value CalculatePC(u32 offset = 0); Value GetCurrentInstructionPC(u32 offset = 0); @@ -244,6 +247,7 @@ private: CodeEmitter* m_emit; TickCount m_delayed_cycles_add = 0; + TickCount m_gte_done_cycle = 0; TickCount m_pc_offset = 0; TickCount m_current_instruction_pc_offset = 0; TickCount m_next_pc_offset = 0; @@ -254,6 +258,7 @@ private: bool m_current_instruction_was_branch_taken_dirty = false; bool m_load_delay_dirty = false; bool m_next_load_delay_dirty = false; + bool m_gte_busy_cycles_dirty = false; bool m_fastmem_load_base_in_register = false; bool m_fastmem_store_base_in_register = false; diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index 260ee8a74..3ab3014c0 100644 --- 
a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -1695,6 +1695,24 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) m_emit->Bind(&skip_cancel); } +void CodeGenerator::EmitStallUntilGTEComplete() +{ + static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick)); + + m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks))); + m_emit->ldr(GetHostReg32(RARG2), a32::MemOperand(GetCPUPtrReg(), offsetof(State, gte_completion_tick))); + + if (m_delayed_cycles_add > 0) + { + m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), static_cast(m_delayed_cycles_add)); + m_delayed_cycles_add = 0; + } + + m_emit->cmp(GetHostReg32(RARG2), GetHostReg32(RARG1)); + m_emit->mov(a32::hi, GetHostReg32(RARG1), GetHostReg32(RARG2)); + m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks))); +} + void CodeGenerator::EmitBranch(const void* address, bool allow_scratch) { const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), address); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 5d57aa56e..76f542637 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1890,6 +1890,23 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg) m_emit->Bind(&skip_cancel); } +void CodeGenerator::EmitStallUntilGTEComplete() +{ + static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick)); + m_emit->ldp(GetHostReg32(RARG1), GetHostReg32(RARG2), + a64::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks))); + + if (m_delayed_cycles_add > 0) + { + m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), static_cast(m_delayed_cycles_add)); + m_delayed_cycles_add = 0; + } + + 
m_emit->cmp(GetHostReg32(RARG2), GetHostReg32(RARG1)); + m_emit->csel(GetHostReg32(RARG1), GetHostReg32(RARG2), GetHostReg32(RARG1), a64::Condition::hi); + m_emit->str(GetHostReg32(RARG1), a64::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks))); +} + void CodeGenerator::EmitBranch(const void* address, bool allow_scratch) { const s64 jump_distance = diff --git a/src/core/cpu_recompiler_code_generator_generic.cpp b/src/core/cpu_recompiler_code_generator_generic.cpp index 2a14b6e5f..945fb8204 100644 --- a/src/core/cpu_recompiler_code_generator_generic.cpp +++ b/src/core/cpu_recompiler_code_generator_generic.cpp @@ -207,4 +207,35 @@ void CodeGenerator::EmitICacheCheckAndUpdate() #endif +#if 0 // Not Used + +void CodeGenerator::EmitStallUntilGTEComplete() +{ + Value pending_ticks = m_register_cache.AllocateScratch(RegSize_32); + Value gte_completion_tick = m_register_cache.AllocateScratch(RegSize_32); + EmitLoadCPUStructField(pending_ticks.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks)); + EmitLoadCPUStructField(gte_completion_tick.GetHostRegister(), RegSize_32, offsetof(State, gte_completion_tick)); + + // commit cycles here, should always be nonzero + if (m_delayed_cycles_add > 0) + { + EmitAdd(pending_ticks.GetHostRegister(), pending_ticks.GetHostRegister(), + Value::FromConstantU32(m_delayed_cycles_add), false); + m_delayed_cycles_add = 0; + } + + LabelType gte_done; + EmitSub(gte_completion_tick.GetHostRegister(), gte_completion_tick.GetHostRegister(), pending_ticks, true); + EmitConditionalBranch(Condition::Below, false, &gte_done); + + // add stall ticks + EmitAdd(pending_ticks.GetHostRegister(), pending_ticks.GetHostRegister(), gte_completion_tick, false); + + // store new ticks + EmitBindLabel(&gte_done); + EmitStoreCPUStructField(offsetof(State, pending_ticks), pending_ticks); +} + +#endif + } // namespace CPU::Recompiler diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index 
44592435a..746282fb1 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -2656,6 +2656,22 @@ void CodeGenerator::EmitICacheCheckAndUpdate() m_register_cache.UninhibitAllocation(); } +void CodeGenerator::EmitStallUntilGTEComplete() +{ + m_emit->mov(GetHostReg32(RRETURN), m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)]); + m_emit->mov(GetHostReg32(RARG1), m_emit->dword[GetCPUPtrReg() + offsetof(State, gte_completion_tick)]); + + if (m_delayed_cycles_add > 0) + { + m_emit->add(GetHostReg32(RRETURN), static_cast(m_delayed_cycles_add)); + m_delayed_cycles_add = 0; + } + + m_emit->cmp(GetHostReg32(RARG1), GetHostReg32(RRETURN)); + m_emit->cmova(GetHostReg32(RRETURN), GetHostReg32(RARG1)); + m_emit->mov(m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)], GetHostReg32(RRETURN)); +} + void CodeGenerator::EmitBranch(const void* address, bool allow_scratch) { const s64 jump_distance = diff --git a/src/core/gte.cpp b/src/core/gte.cpp index 6ae096391..809f604d8 100644 --- a/src/core/gte.cpp +++ b/src/core/gte.cpp @@ -3,10 +3,12 @@ #include "common/bitutils.h" #include "common/state_wrapper.h" #include "cpu_core.h" +#include "cpu_core_private.h" #include "host_display.h" #include "host_interface.h" #include "pgxp.h" #include "settings.h" +#include "timing_event.h" #include #include #include @@ -1157,11 +1159,13 @@ void ExecuteInstruction(u32 inst_bits) switch (inst.command) { case 0x01: + CPU::AddGTETicks(15); Execute_RTPS(inst); break; case 0x06: { + CPU::AddGTETicks(8); if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) Execute_NCLIP_PGXP(inst); else @@ -1170,82 +1174,102 @@ void ExecuteInstruction(u32 inst_bits) break; case 0x0C: + CPU::AddGTETicks(6); Execute_OP(inst); break; case 0x10: + CPU::AddGTETicks(8); Execute_DPCS(inst); break; case 0x11: + CPU::AddGTETicks(7); Execute_INTPL(inst); break; case 0x12: + CPU::AddGTETicks(8); Execute_MVMVA(inst); break; case 0x13: + 
CPU::AddGTETicks(19); Execute_NCDS(inst); break; case 0x14: + CPU::AddGTETicks(13); Execute_CDP(inst); break; case 0x16: + CPU::AddGTETicks(44); Execute_NCDT(inst); break; case 0x1B: + CPU::AddGTETicks(17); Execute_NCCS(inst); break; case 0x1C: + CPU::AddGTETicks(11); Execute_CC(inst); break; case 0x1E: + CPU::AddGTETicks(14); Execute_NCS(inst); break; case 0x20: + CPU::AddGTETicks(30); Execute_NCT(inst); break; case 0x28: + CPU::AddGTETicks(5); Execute_SQR(inst); break; case 0x29: + CPU::AddGTETicks(8); Execute_DCPL(inst); break; case 0x2A: + CPU::AddGTETicks(17); Execute_DPCT(inst); break; case 0x2D: + CPU::AddGTETicks(5); Execute_AVSZ3(inst); break; case 0x2E: + CPU::AddGTETicks(6); Execute_AVSZ4(inst); break; case 0x30: + CPU::AddGTETicks(23); Execute_RTPT(inst); break; case 0x3D: + CPU::AddGTETicks(5); Execute_GPF(inst); break; case 0x3E: + CPU::AddGTETicks(5); Execute_GPL(inst); break; case 0x3F: + CPU::AddGTETicks(39); Execute_NCCT(inst); break; @@ -1255,16 +1279,18 @@ void ExecuteInstruction(u32 inst_bits) } } -InstructionImpl GetInstructionImpl(u32 inst_bits) +InstructionImpl GetInstructionImpl(u32 inst_bits, TickCount* ticks) { const Instruction inst{inst_bits}; switch (inst.command) { case 0x01: + *ticks = 15; return &Execute_RTPS; case 0x06: { + *ticks = 8; if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) return &Execute_NCLIP_PGXP; else @@ -1272,63 +1298,83 @@ InstructionImpl GetInstructionImpl(u32 inst_bits) } case 0x0C: + *ticks = 6; return &Execute_OP; case 0x10: + *ticks = 8; return &Execute_DPCS; case 0x11: + *ticks = 7; return &Execute_INTPL; case 0x12: + *ticks = 8; return &Execute_MVMVA; case 0x13: + *ticks = 19; return &Execute_NCDS; case 0x14: + *ticks = 13; return &Execute_CDP; case 0x16: + *ticks = 44; return &Execute_NCDT; case 0x1B: + *ticks = 17; return &Execute_NCCS; case 0x1C: + *ticks = 11; return &Execute_CC; case 0x1E: + *ticks = 14; return &Execute_NCS; case 0x20: + *ticks = 30; return &Execute_NCT; case 0x28: + 
*ticks = 5; return &Execute_SQR; case 0x29: + *ticks = 8; return &Execute_DCPL; case 0x2A: + *ticks = 17; return &Execute_DPCT; case 0x2D: + *ticks = 5; return &Execute_AVSZ3; case 0x2E: + *ticks = 6; return &Execute_AVSZ4; case 0x30: + *ticks = 23; return &Execute_RTPT; case 0x3D: + *ticks = 5; return &Execute_GPF; case 0x3E: + *ticks = 5; return &Execute_GPL; case 0x3F: + *ticks = 39; return &Execute_NCCT; default: diff --git a/src/core/gte.h b/src/core/gte.h index 217a897b7..63cc6abbb 100644 --- a/src/core/gte.h +++ b/src/core/gte.h @@ -20,6 +20,6 @@ u32* GetRegisterPtr(u32 index); void ExecuteInstruction(u32 inst_bits); using InstructionImpl = void (*)(Instruction); -InstructionImpl GetInstructionImpl(u32 inst_bits); +InstructionImpl GetInstructionImpl(u32 inst_bits, TickCount* ticks); } // namespace GTE