CPU/Recompiler: Elide tick flush completely where possible

This commit is contained in:
Connor McLaughlin 2021-06-13 18:06:33 +10:00
parent e8ac1fca80
commit 2113405c7a
4 changed files with 48 additions and 11 deletions

View file

@@ -1276,8 +1276,6 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
break; break;
} }
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS));
bpi.host_code_size = static_cast<u32>( bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc))); static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
@@ -1286,8 +1284,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
// generate slowmem fallback // generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// restore fastmem base state for the next instruction // restore fastmem base state for the next instruction
if (old_store_fastmem_base) if (old_store_fastmem_base)
fastmem_base = GetFastmemStoreBase(); fastmem_base = GetFastmemStoreBase();
@@ -1436,8 +1443,15 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
// generate slowmem fallback // generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, actual_value, true); EmitStoreGuestMemorySlowmem(cbi, address, size, actual_value, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// restore fastmem base state for the next instruction // restore fastmem base state for the next instruction
if (old_load_fastmem_base) if (old_load_fastmem_base)
fastmem_base = GetFastmemLoadBase(); fastmem_base = GetFastmemLoadBase();

View file

@@ -1463,16 +1463,23 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
} }
} }
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS));
bpi.host_code_size = static_cast<u32>( bpi.host_code_size = static_cast<u32>(
static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc))); static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
// generate slowmem fallback // generate slowmem fallback
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// return to the block code // return to the block code
EmitBranch(GetCurrentNearCodePointer(), false); EmitBranch(GetCurrentNearCodePointer(), false);
@@ -1638,8 +1645,14 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, value_in_hr, true); EmitStoreGuestMemorySlowmem(cbi, address, size, value_in_hr, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// return to the block code // return to the block code
EmitBranch(GetCurrentNearCodePointer(), false); EmitBranch(GetCurrentNearCodePointer(), false);

View file

@@ -57,8 +57,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
} }
} }
AddPendingCycles(true);
Value result = m_register_cache.AllocateScratch(HostPointerSize); Value result = m_register_cache.AllocateScratch(HostPointerSize);
const bool use_fastmem = const bool use_fastmem =
@@ -83,6 +81,7 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
} }
else else
{ {
AddPendingCycles(true);
m_register_cache.FlushCallerSavedGuestRegisters(true, true); m_register_cache.FlushCallerSavedGuestRegisters(true, true);
EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); EmitLoadGuestMemorySlowmem(cbi, address, size, result, false);
} }
@@ -133,8 +132,6 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
} }
} }
AddPendingCycles(true);
const bool use_fastmem = const bool use_fastmem =
(address_spec ? Bus::CanUseFastmemForAddress(*address_spec) : true) && !SpeculativeIsCacheIsolated(); (address_spec ? Bus::CanUseFastmemForAddress(*address_spec) : true) && !SpeculativeIsCacheIsolated();
if (address_spec) if (address_spec)
@@ -157,6 +154,7 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
} }
else else
{ {
AddPendingCycles(true);
m_register_cache.FlushCallerSavedGuestRegisters(true, true); m_register_cache.FlushCallerSavedGuestRegisters(true, true);
EmitStoreGuestMemorySlowmem(cbi, address, size, value, false); EmitStoreGuestMemorySlowmem(cbi, address, size, value, false);
} }

View file

@@ -1955,9 +1955,6 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
} }
} }
// TODO: BIOS reads...
EmitAddCPUStructField(offsetof(CPU::State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS));
// insert nops, we need at least 5 bytes for a relative jump // insert nops, we need at least 5 bytes for a relative jump
const u32 fastmem_size = const u32 fastmem_size =
static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)); static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
@@ -1972,8 +1969,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi,
m_far_emitter.align(16); m_far_emitter.align(16);
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
// we add the ticks *after* the add here, since we counted incorrectly, then correct for it below
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
m_delayed_cycles_add += Bus::RAM_READ_TICKS;
EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// return to the block code // return to the block code
m_emit->jmp(GetCurrentNearCodePointer()); m_emit->jmp(GetCurrentNearCodePointer());
@@ -2234,8 +2240,14 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi,
bpi.host_slowmem_pc = GetCurrentFarCodePointer(); bpi.host_slowmem_pc = GetCurrentFarCodePointer();
SwitchToFarCode(); SwitchToFarCode();
DebugAssert(m_delayed_cycles_add > 0);
EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast<u32>(m_delayed_cycles_add)));
EmitStoreGuestMemorySlowmem(cbi, address, size, value, true); EmitStoreGuestMemorySlowmem(cbi, address, size, value, true);
EmitAddCPUStructField(offsetof(State, pending_ticks),
Value::FromConstantU32(static_cast<u32>(-m_delayed_cycles_add)));
// return to the block code // return to the block code
m_emit->jmp(GetCurrentNearCodePointer()); m_emit->jmp(GetCurrentNearCodePointer());