CPU: Simulate stalls from GTE instructions

Connor McLaughlin 2021-07-15 21:57:52 +10:00
parent 3d4cdb6d92
commit 30db081a64
11 changed files with 223 additions and 6 deletions


@@ -181,7 +181,10 @@ bool DoState(StateWrapper& sw)
}
if (sw.IsReading())
{
UpdateFastmemBase();
g_state.gte_completion_tick = 0;
}
return !sw.HasError();
}
@@ -1462,6 +1465,8 @@ restart_instruction:
return;
}
StallUntilGTEComplete();
if (inst.cop.IsCommonInstruction())
{
// TODO: Combine with cop0.
@@ -1533,6 +1538,7 @@ restart_instruction:
if (!ReadMemoryWord(addr, &value))
return;
StallUntilGTEComplete();
GTE::WriteRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())), value);
if constexpr (pgxp_mode >= PGXPMode::Memory)
@@ -1549,6 +1555,8 @@ restart_instruction:
return;
}
StallUntilGTEComplete();
const VirtualMemoryAddress addr = ReadReg(inst.i.rs) + inst.i.imm_sext32();
const u32 value = GTE::ReadRegister(ZeroExtend32(static_cast<u8>(inst.i.rt.GetValue())));
WriteMemoryWord(addr, value);
@@ -1596,7 +1604,10 @@ void DispatchInterrupt()
// instruction. For some reason, if we don't do this, we end up with incorrectly sorted polygons and flickering..
SafeReadInstruction(g_state.regs.pc, &g_state.next_instruction.bits);
if (g_state.next_instruction.op == InstructionOp::cop2 && !g_state.next_instruction.cop.IsCommonInstruction())
{
StallUntilGTEComplete();
GTE::ExecuteInstruction(g_state.next_instruction.bits);
}
// Interrupt raising occurs before the start of the instruction.
RaiseException(


@@ -46,8 +46,9 @@ union CacheControl
struct State
{
// ticks the CPU has executed
TickCount downcount = 0;
TickCount pending_ticks = 0;
TickCount gte_completion_tick = 0;
Registers regs = {};
Cop0Registers cop0_regs = {};
@@ -118,6 +119,8 @@ ALWAYS_INLINE TickCount GetPendingTicks()
}
ALWAYS_INLINE void ResetPendingTicks()
{
g_state.gte_completion_tick =
(g_state.pending_ticks < g_state.gte_completion_tick) ? (g_state.gte_completion_tick - g_state.pending_ticks) : 0;
g_state.pending_ticks = 0;
}
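ResetPendingTicks() now also rebases gte_completion_tick, because the completion tick is stored as an absolute pending_ticks value. A self-contained sketch of that rebasing with made-up numbers; only the arithmetic mirrors the helper above:

  #include <cassert>
  #include <cstdint>

  using TickCount = std::int32_t;

  // Same rebasing as ResetPendingTicks() performs on g_state, applied to stand-in values.
  static TickCount RebaseCompletionTick(TickCount pending_ticks, TickCount gte_completion_tick)
  {
    return (pending_ticks < gte_completion_tick) ? (gte_completion_tick - pending_ticks) : 0;
  }

  int main()
  {
    assert(RebaseCompletionTick(300, 310) == 10); // GTE still busy: 10 ticks remain after the reset
    assert(RebaseCompletionTick(300, 295) == 0);  // GTE already idle: clamp to zero instead of going negative
    return 0;
  }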
ALWAYS_INLINE void AddPendingTicks(TickCount ticks)


@@ -111,4 +111,15 @@ bool WriteMemoryWord(VirtualMemoryAddress addr, u32 value);
void* GetDirectReadMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size, TickCount* read_ticks);
void* GetDirectWriteMemoryPointer(VirtualMemoryAddress address, MemoryAccessSize size);
ALWAYS_INLINE void AddGTETicks(TickCount ticks)
{
g_state.gte_completion_tick = g_state.pending_ticks + ticks + 1;
}
ALWAYS_INLINE void StallUntilGTEComplete()
{
g_state.pending_ticks =
(g_state.gte_completion_tick > g_state.pending_ticks) ? g_state.gte_completion_tick : g_state.pending_ticks;
}
} // namespace CPU
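For reference, a self-contained model of the two helpers above plus a tiny usage trace; the scenario and numbers are made up, and only the arithmetic mirrors the real functions (note the extra +1 that AddGTETicks() adds on top of the command's latency):

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  using TickCount = std::int32_t;

  static TickCount pending_ticks = 0;        // stand-in for g_state.pending_ticks
  static TickCount gte_completion_tick = 0;  // stand-in for g_state.gte_completion_tick

  // Same behaviour as the inline helpers above (std::max in place of the ternary).
  static void AddGTETicks(TickCount ticks) { gte_completion_tick = pending_ticks + ticks + 1; }
  static void StallUntilGTEComplete() { pending_ticks = std::max(pending_ticks, gte_completion_tick); }

  int main()
  {
    pending_ticks = 100;
    AddGTETicks(23);          // e.g. RTPT issued at tick 100: GTE busy until tick 124
    pending_ticks += 2;       // two non-GTE instructions execute in the meantime
    StallUntilGTEComplete();  // the next GTE access has to wait: 102 -> 124
    std::printf("stalled to tick %d\n", pending_ticks); // prints "stalled to tick 124"
    return 0;
  }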


@@ -964,6 +964,7 @@ void CodeGenerator::BlockPrologue()
m_branch_was_taken_dirty = g_settings.cpu_recompiler_memory_exceptions;
m_current_instruction_was_branch_taken_dirty = false;
m_load_delay_dirty = true;
m_gte_busy_cycles_dirty = true;
m_pc_offset = 0;
m_current_instruction_pc_offset = 0;
@@ -1067,13 +1068,63 @@ void CodeGenerator::TruncateBlockAtCurrentInstruction()
void CodeGenerator::AddPendingCycles(bool commit)
{
if (m_delayed_cycles_add == 0 && m_gte_done_cycle <= m_delayed_cycles_add)
return;
if (m_gte_done_cycle > m_delayed_cycles_add)
{
Value temp = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(temp.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks));
if (m_delayed_cycles_add > 0)
{
EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(), Value::FromConstantU32(m_delayed_cycles_add), false);
EmitStoreCPUStructField(offsetof(State, pending_ticks), temp);
EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(),
Value::FromConstantU32(m_gte_done_cycle - m_delayed_cycles_add), false);
EmitStoreCPUStructField(offsetof(State, gte_completion_tick), temp);
}
else
{
EmitAdd(temp.GetHostRegister(), temp.GetHostRegister(), Value::FromConstantU32(m_gte_done_cycle), false);
EmitStoreCPUStructField(offsetof(State, gte_completion_tick), temp);
}
}
else
{
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(m_delayed_cycles_add));
}
if (commit)
{
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_delayed_cycles_add, 0);
m_delayed_cycles_add = 0;
}
}
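In plain C++ terms, the code emitted by the two branches above amounts to the following sketch with stand-in parameters; the generated code operates on g_state through the CPU struct pointer instead:

  #include <cstdint>

  using TickCount = std::int32_t;

  // Sketch of the state update AddPendingCycles() emits; parameter names are illustrative.
  static void CommitBlockCycles(TickCount& pending_ticks, TickCount& gte_completion_tick,
                                TickCount delayed_cycles, TickCount gte_done_cycle)
  {
    if (gte_done_cycle > delayed_cycles)
    {
      // GTE work outlives the cycles being committed: publish an absolute completion tick.
      pending_ticks += delayed_cycles;
      gte_completion_tick = pending_ticks + (gte_done_cycle - delayed_cycles);
    }
    else
    {
      // The GTE finishes within the committed cycles; only the tick counter needs updating.
      pending_ticks += delayed_cycles;
    }
  }

This is also why the commit path clamps m_gte_done_cycle with std::max afterwards: within a block, m_gte_done_cycle is tracked relative to the still-uncommitted m_delayed_cycles_add.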
void CodeGenerator::AddGTETicks(TickCount ticks)
{
m_gte_done_cycle = m_delayed_cycles_add + ticks;
Log_DebugPrintf("Adding %d GTE ticks", ticks);
}
void CodeGenerator::StallUntilGTEComplete()
{
if (!m_gte_busy_cycles_dirty)
{
// simple case - in block scheduling
if (m_gte_done_cycle > m_delayed_cycles_add)
{
Log_DebugPrintf("Stalling for %d ticks from GTE", m_gte_done_cycle - m_delayed_cycles_add);
m_delayed_cycles_add += (m_gte_done_cycle - m_delayed_cycles_add);
}
return;
}
// switch to in block scheduling
EmitStallUntilGTEComplete();
m_gte_done_cycle = 0;
m_gte_busy_cycles_dirty = false;
}
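When the GTE command was issued by an earlier block (m_gte_busy_cycles_dirty at block entry), the stall cannot be resolved at compile time, so EmitStallUntilGTEComplete() generates code that behaves like the sketch below at runtime (stand-in parameters; the host-specific emitters later in this commit also fold the block's pending m_delayed_cycles_add into the addition). When the command was issued inside the same block, the 'in block scheduling' path above instead pads m_delayed_cycles_add by a compile-time constant and emits nothing:

  #include <cstdint>

  using TickCount = std::int32_t;

  // Runtime equivalent of the emitted stall (see the x64/AArch64/AArch32 emitters below).
  static void StallAtRuntime(TickCount& pending_ticks, TickCount gte_completion_tick, TickCount delayed_cycles)
  {
    pending_ticks += delayed_cycles;          // commit cycles the block had batched up so far
    if (gte_completion_tick > pending_ticks)  // GTE from a previous block still busy?
      pending_ticks = gte_completion_tick;    // jump straight to its completion tick
  }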
Value CodeGenerator::CalculatePC(u32 offset /* = 0 */)
@@ -2740,6 +2791,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
{
if (cbi.instruction.op == InstructionOp::lwc2 || cbi.instruction.op == InstructionOp::swc2)
{
StallUntilGTEComplete();
InstructionPrologue(cbi, 1);
const u32 reg = static_cast<u32>(cbi.instruction.i.rt.GetValue());
@@ -2786,6 +2838,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
const u32 reg = static_cast<u32>(cbi.instruction.r.rd.GetValue()) +
((cbi.instruction.cop.CommonOp() == CopCommonInstruction::cfcn) ? 32 : 0);
StallUntilGTEComplete();
InstructionPrologue(cbi, 1);
Value value = DoGTERegisterRead(reg);
@@ -2811,6 +2864,7 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
const u32 reg = static_cast<u32>(cbi.instruction.r.rd.GetValue()) +
((cbi.instruction.cop.CommonOp() == CopCommonInstruction::ctcn) ? 32 : 0);
StallUntilGTEComplete();
InstructionPrologue(cbi, 1);
Value value = m_register_cache.ReadGuestRegister(cbi.instruction.r.rt);
@@ -2833,11 +2887,16 @@ bool CodeGenerator::Compile_cop2(const CodeBlockInstruction& cbi)
}
else
{
TickCount func_ticks;
GTE::InstructionImpl func = GTE::GetInstructionImpl(cbi.instruction.bits, &func_ticks);
// forward everything to the GTE.
StallUntilGTEComplete();
InstructionPrologue(cbi, 1);
Value instruction_bits = Value::FromConstantU32(cbi.instruction.bits & GTE::Instruction::REQUIRED_BITS_MASK);
EmitFunctionCall(nullptr, func, instruction_bits);
AddGTETicks(func_ticks);
InstructionEpilogue(cbi);
return true;


@@ -71,6 +71,7 @@ public:
void EmitMoveNextInterpreterLoadDelay();
void EmitCancelInterpreterLoadDelayForReg(Reg reg);
void EmitICacheCheckAndUpdate();
void EmitStallUntilGTEComplete();
void EmitLoadCPUStructField(HostReg host_reg, RegSize size, u32 offset);
void EmitStoreCPUStructField(u32 offset, const Value& value);
void EmitAddCPUStructField(u32 offset, const Value& value);
@@ -200,6 +201,8 @@ private:
void InstructionEpilogue(const CodeBlockInstruction& cbi);
void TruncateBlockAtCurrentInstruction();
void AddPendingCycles(bool commit);
void AddGTETicks(TickCount ticks);
void StallUntilGTEComplete();
Value CalculatePC(u32 offset = 0);
Value GetCurrentInstructionPC(u32 offset = 0);
@@ -244,6 +247,7 @@ private:
CodeEmitter* m_emit;
TickCount m_delayed_cycles_add = 0;
TickCount m_gte_done_cycle = 0;
TickCount m_pc_offset = 0;
TickCount m_current_instruction_pc_offset = 0;
TickCount m_next_pc_offset = 0;
@@ -254,6 +258,7 @@ private:
bool m_current_instruction_was_branch_taken_dirty = false;
bool m_load_delay_dirty = false;
bool m_next_load_delay_dirty = false;
bool m_gte_busy_cycles_dirty = false;
bool m_fastmem_load_base_in_register = false;
bool m_fastmem_store_base_in_register = false;


@@ -1695,6 +1695,24 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg)
m_emit->Bind(&skip_cancel);
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick));
m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks)));
m_emit->ldr(GetHostReg32(RARG2), a32::MemOperand(GetCPUPtrReg(), offsetof(State, gte_completion_tick)));
if (m_delayed_cycles_add > 0)
{
m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), static_cast<u32>(m_delayed_cycles_add));
m_delayed_cycles_add = 0;
}
m_emit->cmp(GetHostReg32(RARG2), GetHostReg32(RARG1));
m_emit->mov(a32::hi, GetHostReg32(RARG1), GetHostReg32(RARG2));
m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks)));
}
void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
{
const s32 displacement = GetPCDisplacement(GetCurrentCodePointer(), address);


@@ -1890,6 +1890,23 @@ void CodeGenerator::EmitCancelInterpreterLoadDelayForReg(Reg reg)
m_emit->Bind(&skip_cancel);
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
static_assert(offsetof(State, pending_ticks) + sizeof(u32) == offsetof(State, gte_completion_tick));
m_emit->ldp(GetHostReg32(RARG1), GetHostReg32(RARG2),
a64::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks)));
if (m_delayed_cycles_add > 0)
{
m_emit->Add(GetHostReg32(RARG1), GetHostReg32(RARG1), static_cast<u32>(m_delayed_cycles_add));
m_delayed_cycles_add = 0;
}
m_emit->cmp(GetHostReg32(RARG2), GetHostReg32(RARG1));
m_emit->csel(GetHostReg32(RARG1), GetHostReg32(RARG2), GetHostReg32(RARG1), a64::Condition::hi);
m_emit->str(GetHostReg32(RARG1), a64::MemOperand(GetCPUPtrReg(), offsetof(State, pending_ticks)));
}
void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
{
const s64 jump_distance =


@@ -207,4 +207,35 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
#endif
#if 0 // Not Used
void CodeGenerator::EmitStallUntilGTEComplete()
{
Value pending_ticks = m_register_cache.AllocateScratch(RegSize_32);
Value gte_completion_tick = m_register_cache.AllocateScratch(RegSize_32);
EmitLoadCPUStructField(pending_ticks.GetHostRegister(), RegSize_32, offsetof(State, pending_ticks));
EmitLoadCPUStructField(gte_completion_tick.GetHostRegister(), RegSize_32, offsetof(State, gte_completion_tick));
// commit cycles here, should always be nonzero
if (m_delayed_cycles_add > 0)
{
EmitAdd(pending_ticks.GetHostRegister(), pending_ticks.GetHostRegister(),
Value::FromConstantU32(m_delayed_cycles_add), false);
m_delayed_cycles_add = 0;
}
LabelType gte_done;
EmitSub(gte_completion_tick.GetHostRegister(), gte_completion_tick.GetHostRegister(), pending_ticks, true);
EmitConditionalBranch(Condition::Below, false, &gte_done);
// add stall ticks
EmitAdd(pending_ticks.GetHostRegister(), pending_ticks.GetHostRegister(), gte_completion_tick, false);
// store new ticks
EmitBindLabel(&gte_done);
EmitStoreCPUStructField(offsetof(State, pending_ticks), pending_ticks);
}
#endif
} // namespace CPU::Recompiler


@@ -2656,6 +2656,22 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
m_register_cache.UninhibitAllocation();
}
void CodeGenerator::EmitStallUntilGTEComplete()
{
m_emit->mov(GetHostReg32(RRETURN), m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)]);
m_emit->mov(GetHostReg32(RARG1), m_emit->dword[GetCPUPtrReg() + offsetof(State, gte_completion_tick)]);
if (m_delayed_cycles_add > 0)
{
m_emit->add(GetHostReg32(RRETURN), static_cast<u32>(m_delayed_cycles_add));
m_delayed_cycles_add = 0;
}
m_emit->cmp(GetHostReg32(RARG1), GetHostReg32(RRETURN));
m_emit->cmova(GetHostReg32(RRETURN), GetHostReg32(RARG1));
m_emit->mov(m_emit->dword[GetCPUPtrReg() + offsetof(State, pending_ticks)], GetHostReg32(RRETURN));
}
void CodeGenerator::EmitBranch(const void* address, bool allow_scratch)
{
const s64 jump_distance =


@@ -3,10 +3,12 @@
#include "common/bitutils.h"
#include "common/state_wrapper.h"
#include "cpu_core.h"
#include "cpu_core_private.h"
#include "host_display.h"
#include "host_interface.h"
#include "pgxp.h"
#include "settings.h"
#include "timing_event.h"
#include <algorithm>
#include <array>
#include <numeric>
@@ -1157,11 +1159,13 @@ void ExecuteInstruction(u32 inst_bits)
switch (inst.command)
{
case 0x01:
CPU::AddGTETicks(15);
Execute_RTPS(inst);
break;
case 0x06:
{
CPU::AddGTETicks(8);
if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
Execute_NCLIP_PGXP(inst);
else
@@ -1170,82 +1174,102 @@ void ExecuteInstruction(u32 inst_bits)
break;
case 0x0C:
CPU::AddGTETicks(6);
Execute_OP(inst);
break;
case 0x10:
CPU::AddGTETicks(8);
Execute_DPCS(inst);
break;
case 0x11:
CPU::AddGTETicks(7);
Execute_INTPL(inst);
break;
case 0x12:
CPU::AddGTETicks(8);
Execute_MVMVA(inst);
break;
case 0x13:
CPU::AddGTETicks(19);
Execute_NCDS(inst);
break;
case 0x14:
CPU::AddGTETicks(13);
Execute_CDP(inst);
break;
case 0x16:
CPU::AddGTETicks(44);
Execute_NCDT(inst);
break;
case 0x1B:
CPU::AddGTETicks(17);
Execute_NCCS(inst);
break;
case 0x1C:
CPU::AddGTETicks(11);
Execute_CC(inst);
break;
case 0x1E:
CPU::AddGTETicks(14);
Execute_NCS(inst);
break;
case 0x20:
CPU::AddGTETicks(30);
Execute_NCT(inst);
break;
case 0x28:
CPU::AddGTETicks(5);
Execute_SQR(inst);
break;
case 0x29:
CPU::AddGTETicks(8);
Execute_DCPL(inst);
break;
case 0x2A:
CPU::AddGTETicks(17);
Execute_DPCT(inst);
break;
case 0x2D:
CPU::AddGTETicks(5);
Execute_AVSZ3(inst);
break;
case 0x2E:
CPU::AddGTETicks(6);
Execute_AVSZ4(inst);
break;
case 0x30:
CPU::AddGTETicks(23);
Execute_RTPT(inst);
break;
case 0x3D:
CPU::AddGTETicks(5);
Execute_GPF(inst);
break;
case 0x3E:
CPU::AddGTETicks(5);
Execute_GPL(inst);
break;
case 0x3F:
CPU::AddGTETicks(39);
Execute_NCCT(inst);
break;
@@ -1255,16 +1279,18 @@ void ExecuteInstruction(u32 inst_bits)
}
}
InstructionImpl GetInstructionImpl(u32 inst_bits, TickCount* ticks)
{
const Instruction inst{inst_bits};
switch (inst.command)
{
case 0x01:
*ticks = 15;
return &Execute_RTPS;
case 0x06:
{
*ticks = 8;
if (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling)
return &Execute_NCLIP_PGXP;
else
@@ -1272,63 +1298,83 @@ InstructionImpl GetInstructionImpl(u32 inst_bits)
}
case 0x0C:
*ticks = 6;
return &Execute_OP;
case 0x10:
*ticks = 8;
return &Execute_DPCS;
case 0x11:
*ticks = 7;
return &Execute_INTPL;
case 0x12:
*ticks = 8;
return &Execute_MVMVA;
case 0x13:
*ticks = 19;
return &Execute_NCDS;
case 0x14:
*ticks = 13;
return &Execute_CDP;
case 0x16:
*ticks = 44;
return &Execute_NCDT;
case 0x1B:
*ticks = 17;
return &Execute_NCCS;
case 0x1C:
*ticks = 11;
return &Execute_CC;
case 0x1E:
*ticks = 14;
return &Execute_NCS;
case 0x20:
*ticks = 30;
return &Execute_NCT;
case 0x28:
*ticks = 5;
return &Execute_SQR;
case 0x29:
*ticks = 8;
return &Execute_DCPL;
case 0x2A:
*ticks = 17;
return &Execute_DPCT;
case 0x2D:
*ticks = 5;
return &Execute_AVSZ3;
case 0x2E:
*ticks = 6;
return &Execute_AVSZ4;
case 0x30:
*ticks = 23;
return &Execute_RTPT;
case 0x3D:
*ticks = 5;
return &Execute_GPF;
case 0x3E:
*ticks = 5;
return &Execute_GPL;
case 0x3F:
*ticks = 39;
return &Execute_NCCT;
default:


@@ -20,6 +20,6 @@ u32* GetRegisterPtr(u32 index);
void ExecuteInstruction(u32 inst_bits);
using InstructionImpl = void (*)(Instruction);
InstructionImpl GetInstructionImpl(u32 inst_bits, TickCount* ticks);
} // namespace GTE