From 2113405c7abb8ab05ac767384bf17061479cf1f9 Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 13 Jun 2021 18:06:33 +1000 Subject: [PATCH] CPU/Recompiler: Elide tick flush completely where possible --- .../cpu_recompiler_code_generator_aarch32.cpp | 18 ++++++++++++++++-- .../cpu_recompiler_code_generator_aarch64.cpp | 17 +++++++++++++++-- .../cpu_recompiler_code_generator_generic.cpp | 6 ++---- src/core/cpu_recompiler_code_generator_x64.cpp | 18 +++++++++++++++--- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index 2cb83f2ba..397a75263 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -1276,8 +1276,6 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, break; } - EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); - bpi.host_code_size = static_cast( static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); @@ -1286,8 +1284,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, // generate slowmem fallback bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + + // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + m_delayed_cycles_add += Bus::RAM_READ_TICKS; + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // restore fastmem base state for the next instruction if (old_store_fastmem_base) fastmem_base = GetFastmemStoreBase(); @@ -1436,8 +1443,15 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, // generate slowmem fallback bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + EmitStoreGuestMemorySlowmem(cbi, address, size, actual_value, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // restore fastmem base state for the next instruction if (old_load_fastmem_base) fastmem_base = GetFastmemLoadBase(); diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp index 52f3fc32e..178b629cf 100644 --- a/src/core/cpu_recompiler_code_generator_aarch64.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp @@ -1463,16 +1463,23 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, } } - EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); - bpi.host_code_size = static_cast( static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc))); // generate slowmem fallback bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + + // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + m_delayed_cycles_add += Bus::RAM_READ_TICKS; + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // return to the block code EmitBranch(GetCurrentNearCodePointer(), false); @@ -1638,8 +1645,14 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + EmitStoreGuestMemorySlowmem(cbi, address, size, value_in_hr, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // return to the block code EmitBranch(GetCurrentNearCodePointer(), false); diff --git a/src/core/cpu_recompiler_code_generator_generic.cpp b/src/core/cpu_recompiler_code_generator_generic.cpp index 73b146b62..2585f1f29 100644 --- a/src/core/cpu_recompiler_code_generator_generic.cpp +++ b/src/core/cpu_recompiler_code_generator_generic.cpp @@ -57,8 +57,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const } } - AddPendingCycles(true); - Value result = m_register_cache.AllocateScratch(HostPointerSize); const bool use_fastmem = @@ -83,6 +81,7 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const } else { + AddPendingCycles(true); m_register_cache.FlushCallerSavedGuestRegisters(true, true); EmitLoadGuestMemorySlowmem(cbi, address, size, result, false); } @@ -133,8 +132,6 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } } - AddPendingCycles(true); - const bool use_fastmem = (address_spec ? Bus::CanUseFastmemForAddress(*address_spec) : true) && !SpeculativeIsCacheIsolated(); if (address_spec) @@ -157,6 +154,7 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const } else { + AddPendingCycles(true); m_register_cache.FlushCallerSavedGuestRegisters(true, true); EmitStoreGuestMemorySlowmem(cbi, address, size, value, false); } diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index d08c766a6..44592435a 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -1955,9 +1955,6 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, } } - // TODO: BIOS reads... - EmitAddCPUStructField(offsetof(CPU::State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS)); - // insert nops, we need at least 5 bytes for a relative jump const u32 fastmem_size = static_cast(static_cast(GetCurrentNearCodePointer()) - static_cast(bpi.host_pc)); @@ -1972,8 +1969,17 @@ void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, m_far_emitter.align(16); bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + + // we add the ticks *after* the add here, since we counted incorrectly, then correct for it below + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + m_delayed_cycles_add += Bus::RAM_READ_TICKS; + EmitLoadGuestMemorySlowmem(cbi, address, size, result, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // return to the block code m_emit->jmp(GetCurrentNearCodePointer()); @@ -2234,8 +2240,14 @@ void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, bpi.host_slowmem_pc = GetCurrentFarCodePointer(); SwitchToFarCode(); + DebugAssert(m_delayed_cycles_add > 0); + EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(static_cast(m_delayed_cycles_add))); + EmitStoreGuestMemorySlowmem(cbi, address, size, value, true); + EmitAddCPUStructField(offsetof(State, pending_ticks), + Value::FromConstantU32(static_cast(-m_delayed_cycles_add))); + // return to the block code m_emit->jmp(GetCurrentNearCodePointer());