From 9501439d6b4647bfc840b7173774435f6132f8ef Mon Sep 17 00:00:00 2001 From: Stenzek Date: Wed, 4 Oct 2023 00:39:18 +1000 Subject: [PATCH] CPU: Add new experimental recompiler --- CMakeLists.txt | 4 + src/core/CMakeLists.txt | 33 + src/core/core.props | 1 + src/core/core.vcxproj | 14 + src/core/core.vcxproj.filters | 6 + src/core/cpu_code_cache.cpp | 19 +- src/core/cpu_code_cache_private.h | 2 +- src/core/cpu_core.cpp | 1 + src/core/cpu_newrec_compiler.cpp | 2277 +++++++++++++++ src/core/cpu_newrec_compiler.h | 465 ++++ src/core/cpu_newrec_compiler_aarch64.cpp | 2235 +++++++++++++++ src/core/cpu_newrec_compiler_aarch64.h | 164 ++ src/core/cpu_newrec_compiler_riscv64.cpp | 2453 +++++++++++++++++ src/core/cpu_newrec_compiler_riscv64.h | 168 ++ src/core/cpu_newrec_compiler_x64.cpp | 2196 +++++++++++++++ src/core/cpu_newrec_compiler_x64.h | 140 + .../cpu_recompiler_code_generator_x64.cpp | 3 +- src/core/cpu_recompiler_types.h | 37 + src/core/imgui_overlays.cpp | 5 + src/core/settings.cpp | 6 +- src/core/settings.h | 5 +- src/core/system.cpp | 2 +- src/core/types.h | 1 + 23 files changed, 10228 insertions(+), 9 deletions(-) create mode 100644 src/core/cpu_newrec_compiler.cpp create mode 100644 src/core/cpu_newrec_compiler.h create mode 100644 src/core/cpu_newrec_compiler_aarch64.cpp create mode 100644 src/core/cpu_newrec_compiler_aarch64.h create mode 100644 src/core/cpu_newrec_compiler_riscv64.cpp create mode 100644 src/core/cpu_newrec_compiler_riscv64.h create mode 100644 src/core/cpu_newrec_compiler_x64.cpp create mode 100644 src/core/cpu_newrec_compiler_x64.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 485be291d..d4d62cb80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ endif() # Renderer options. option(ENABLE_OPENGL "Build with OpenGL renderer" ON) option(ENABLE_VULKAN "Build with Vulkan renderer" ON) +option(ENABLE_NEWREC "Build with experimental new dynarec (needed for RISC-V)" ON) # Global options. if(NOT ANDROID) @@ -171,6 +172,9 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm" OR "${CMAKE_SYSTEM_PROCESSOR}" endif() elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64") set(CPU_ARCH "riscv64") + + # Not done for us. Or we should inline atomics? 
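+  # Some riscv64 toolchains lower sub-word std::atomic operations to libatomic
+  # calls instead of inlining them, so explicitly link libatomic for now.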
+ link_libraries("-latomic") else() message(FATAL_ERROR "Unknown system processor: ${CMAKE_SYSTEM_PROCESSOR}") endif() diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index d2e070ddb..ee3cdf607 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -121,6 +121,11 @@ set(RECOMPILER_SRCS cpu_recompiler_types.h ) +set(NEWREC_SOURCES + cpu_newrec_compiler.cpp + cpu_newrec_compiler.h +) + target_precompile_headers(core PRIVATE "pch.h") target_include_directories(core PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/..") target_include_directories(core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/..") @@ -134,6 +139,15 @@ if(${CPU_ARCH} STREQUAL "x64") cpu_recompiler_code_generator_x64.cpp ) message("Building x64 recompiler") + + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_x64.cpp + cpu_newrec_compiler_x64.h + ) + message("Building x64 newrec") + endif() elseif(${CPU_ARCH} STREQUAL "aarch32") target_compile_definitions(core PUBLIC "ENABLE_RECOMPILER=1") target_sources(core PRIVATE ${RECOMPILER_SRCS} @@ -148,6 +162,25 @@ elseif(${CPU_ARCH} STREQUAL "aarch64") ) target_link_libraries(core PUBLIC vixl) message("Building AArch64 recompiler") + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_aarch64.cpp + cpu_newrec_compiler_aarch64.h + ) + message("Building AArch64 newrec") + endif() +elseif(${CPU_ARCH} STREQUAL "riscv64") + target_compile_definitions(core PUBLIC "ENABLE_MMAP_FASTMEM=1") + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_riscv64.cpp + cpu_newrec_compiler_riscv64.h + ) + target_link_libraries(core PUBLIC biscuit::biscuit riscv-disas) + message("Building RISC-V 64-bit newrec") + endif() else() message("Not building recompiler") endif() diff --git a/src/core/core.props b/src/core/core.props index a4ceaf16b..9bd5357f4 100644 --- a/src/core/core.props +++ b/src/core/core.props @@ -8,6 +8,7 @@ ENABLE_RAINTEGRATION=1;%(PreprocessorDefinitions) ENABLE_RECOMPILER=1;%(PreprocessorDefinitions) ENABLE_MMAP_FASTMEM=1;%(PreprocessorDefinitions) + ENABLE_NEWREC=1;%(PreprocessorDefinitions) %(AdditionalIncludeDirectories);$(SolutionDir)dep\xxhash\include;$(SolutionDir)dep\zlib\include;$(SolutionDir)dep\rcheevos\include;$(SolutionDir)dep\rapidjson\include;$(SolutionDir)dep\discord-rpc\include %(AdditionalIncludeDirectories);$(SolutionDir)dep\rainterface diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj index 6366658d7..9846a201c 100644 --- a/src/core/core.vcxproj +++ b/src/core/core.vcxproj @@ -13,6 +13,13 @@ + + + true + + + true + true @@ -90,6 +97,13 @@ + + + true + + + true + true diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters index f0bd545d4..e6dea09a4 100644 --- a/src/core/core.vcxproj.filters +++ b/src/core/core.vcxproj.filters @@ -60,6 +60,9 @@ + + + @@ -125,5 +128,8 @@ + + + \ No newline at end of file diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 3edef3860..8bf28e196 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -21,6 +21,10 @@ Log_SetChannel(CPU::CodeCache); #include "cpu_recompiler_code_generator.h" #endif +#ifdef ENABLE_NEWREC +#include "cpu_newrec_compiler.h" +#endif + #include #include @@ -144,7 +148,8 @@ static u32 s_total_host_instructions_emitted = 0; bool 
CPU::CodeCache::IsUsingAnyRecompiler() { #ifdef ENABLE_RECOMPILER_SUPPORT - return g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler; + return (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler || + g_settings.cpu_execution_mode == CPUExecutionMode::NewRec); #else return false; #endif @@ -498,8 +503,8 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio return block; } - // TODO: Only used by NewRec for now, don't waste time filling it. - if constexpr (false) + // Old rec doesn't use backprop info, don't waste time filling it. + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) FillBlockRegInfo(block); // add it to the tracking list for its page @@ -1419,6 +1424,10 @@ bool CPU::CodeCache::CompileBlock(Block* block) host_code = codegen.CompileBlock(block, &host_code_size, &host_far_code_size); } #endif +#ifdef ENABLE_NEWREC + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + host_code = NewRec::g_compiler->CompileBlock(block, &host_code_size, &host_far_code_size); +#endif s_code_buffer.WriteProtect(true); @@ -1570,6 +1579,10 @@ void CPU::CodeCache::BackpatchLoadStore(void* host_pc, const LoadstoreBackpatchI if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler) Recompiler::CodeGenerator::BackpatchLoadStore(host_pc, info); #endif +#ifdef ENABLE_NEWREC + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + NewRec::BackpatchLoadStore(host_pc, info); +#endif s_code_buffer.WriteProtect(true); } diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h index f1392e0d8..341fde10f 100644 --- a/src/core/cpu_code_cache_private.h +++ b/src/core/cpu_code_cache_private.h @@ -227,7 +227,7 @@ void InterpretUncachedBlock(); void LogCurrentState(); -#if defined(ENABLE_RECOMPILER) +#if defined(ENABLE_RECOMPILER) || defined(ENABLE_NEWREC) #define ENABLE_RECOMPILER_SUPPORT 1 #if defined(_DEBUG) || false diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 531453bde..cedabfde0 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -2231,6 +2231,7 @@ void CPU::Execute() { case CPUExecutionMode::Recompiler: case CPUExecutionMode::CachedInterpreter: + case CPUExecutionMode::NewRec: CodeCache::Execute(); break; diff --git a/src/core/cpu_newrec_compiler.cpp b/src/core/cpu_newrec_compiler.cpp new file mode 100644 index 000000000..5a3fb9b42 --- /dev/null +++ b/src/core/cpu_newrec_compiler.cpp @@ -0,0 +1,2277 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/small_string.h" +#include "cpu_code_cache.h" +#include "cpu_core_private.h" +#include "cpu_disasm.h" +#include "pgxp.h" +#include "settings.h" +#include +#include +Log_SetChannel(NewRec::Compiler); + +// TODO: direct link skip delay slot check +// TODO: speculative constants +// TODO: std::bitset in msvc has bounds checks even in release... 
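+
+// This file implements the architecture-independent part of the new recompiler:
+// register cache, constant propagation, load-delay tracking and per-instruction
+// dispatch. Host code emission lives in the per-architecture
+// cpu_newrec_compiler_* backends.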
+ +const std::array, 3> CPU::NewRec::Compiler::s_pgxp_mem_load_functions = { + {{{reinterpret_cast(&PGXP::CPU_LBx), reinterpret_cast(&PGXP::CPU_LBx)}}, + {{reinterpret_cast(&PGXP::CPU_LHU), reinterpret_cast(&PGXP::CPU_LH)}}, + {{reinterpret_cast(&PGXP::CPU_LW)}}}}; +const std::array CPU::NewRec::Compiler::s_pgxp_mem_store_functions = { + {reinterpret_cast(&PGXP::CPU_SB), reinterpret_cast(&PGXP::CPU_SH), + reinterpret_cast(&PGXP::CPU_SW)}}; + +CPU::NewRec::Compiler::Compiler() = default; + +CPU::NewRec::Compiler::~Compiler() = default; + +void CPU::NewRec::Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) +{ + m_block = block; + m_compiler_pc = block->pc; + m_cycles = 0; + m_gte_done_cycle = 0; + inst = nullptr; + iinfo = nullptr; + m_current_instruction_pc = 0; + m_current_instruction_branch_delay_slot = false; + m_dirty_pc = false; + m_dirty_instruction_bits = false; + m_dirty_gte_done_cycle = true; + m_block_ended = false; + m_constant_reg_values.fill(0); + m_constant_regs_valid.reset(); + m_constant_regs_dirty.reset(); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + ClearHostReg(i); + m_register_alloc_counter = 0; + + m_constant_reg_values[static_cast(Reg::zero)] = 0; + m_constant_regs_valid.set(static_cast(Reg::zero)); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; + m_load_delay_register = Reg::count; + m_load_delay_value_register = NUM_HOST_REGS; +} + +void CPU::NewRec::Compiler::BeginBlock() +{ +#if 0 + GenerateCall(reinterpret_cast(&CPU::CodeCache::LogCurrentState)); +#endif + + if (m_block->protection == CodeCache::PageProtectionMode::ManualCheck) + { + Log_DebugPrintf("Generate manual protection for PC %08X", m_block->pc); + const u8* ram_ptr = Bus::g_ram + VirtualAddressToPhysical(m_block->pc); + const u8* shadow_ptr = reinterpret_cast(m_block->Instructions()); + GenerateBlockProtectCheck(ram_ptr, shadow_ptr, m_block->size * sizeof(Instruction)); + } + + if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0) + GenerateICacheCheckAndUpdate(); + + if (g_settings.bios_tty_logging) + { + if (m_block->pc == 0xa0) + GenerateCall(reinterpret_cast(&CPU::HandleA0Syscall)); + else if (m_block->pc == 0xb0) + GenerateCall(reinterpret_cast(&CPU::HandleB0Syscall)); + } + + inst = m_block->Instructions(); + iinfo = m_block->InstructionsInfo(); + m_current_instruction_pc = m_block->pc; + m_current_instruction_branch_delay_slot = false; + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = true; + m_dirty_instruction_bits = true; +} + +const void* CPU::NewRec::Compiler::CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size) +{ + JitCodeBuffer& buffer = CodeCache::GetCodeBuffer(); + Reset(block, buffer.GetFreeCodePointer(), buffer.GetFreeCodeSpace(), buffer.GetFreeFarCodePointer(), + buffer.GetFreeFarCodeSpace()); + + Log_DebugPrintf("Block range: %08X -> %08X", block->pc, block->pc + block->size * 4); + + BeginBlock(); + + for (;;) + { + CompileInstruction(); + + if (iinfo->is_last_instruction || m_block_ended) + { + if (!m_block_ended) + { + // Block was truncated. Link it. 
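+        // The block ended without a terminating branch (e.g. it hit the block
+        // size limit), so finish it at the next sequential PC and let block
+        // linking continue from there.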
+ EndBlock(m_compiler_pc, false); + } + + break; + } + + inst++; + iinfo++; + m_current_instruction_pc += sizeof(Instruction); + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = true; + m_dirty_instruction_bits = true; + } + + // Nothing should be valid anymore + for (u32 i = 0; i < NUM_HOST_REGS; i++) + DebugAssert(!IsHostRegAllocated(i)); + for (u32 i = 1; i < static_cast(Reg::count); i++) + DebugAssert(!m_constant_regs_dirty.test(i) && !m_constant_regs_valid.test(i)); + + u32 code_size, far_code_size; + const void* code = EndCompile(&code_size, &far_code_size); + *host_code_size = code_size; + *host_far_code_size = far_code_size; + buffer.CommitCode(code_size); + buffer.CommitFarCode(far_code_size); + + return code; +} + +void CPU::NewRec::Compiler::SetConstantReg(Reg r, u32 v) +{ + DebugAssert(r < Reg::count && r != Reg::zero); + + // There might still be an incoming load delay which we need to cancel. + CancelLoadDelaysToReg(r); + + if (m_constant_regs_valid.test(static_cast(r)) && m_constant_reg_values[static_cast(r)] == v) + { + // Shouldn't be any host regs though. + DebugAssert(!CheckHostReg(0, HR_TYPE_CPU_REG, r).has_value()); + return; + } + + m_constant_reg_values[static_cast(r)] = v; + m_constant_regs_valid.set(static_cast(r)); + m_constant_regs_dirty.set(static_cast(r)); + + if (const std::optional hostreg = CheckHostReg(0, HR_TYPE_CPU_REG, r); hostreg.has_value()) + { + Log_DebugPrintf("Discarding guest register %s in host register %s due to constant set", GetRegName(r), + GetHostRegName(hostreg.value())); + FreeHostReg(hostreg.value()); + } +} + +void CPU::NewRec::Compiler::CancelLoadDelaysToReg(Reg reg) +{ + if (m_load_delay_register != reg) + return; + + Log_DebugPrintf("Cancelling load delay to %s", GetRegName(reg)); + m_load_delay_register = Reg::count; + if (m_load_delay_value_register != NUM_HOST_REGS) + ClearHostReg(m_load_delay_value_register); +} + +void CPU::NewRec::Compiler::UpdateLoadDelay() +{ + if (m_load_delay_dirty) + { + // we shouldn't have a static load delay. + DebugAssert(!HasLoadDelay()); + + // have to invalidate registers, we might have one of them cached + // TODO: double check the order here, will we trash a new value? we shouldn't... + // thankfully since this only happens on the first instruction, we can get away with just killing anything which + // isn't in write mode, because nothing could've been written before it, and the new value overwrites any + // load-delayed value + Log_DebugPrintf("Invalidating non-dirty registers, and flushing load delay from state"); + + constexpr u32 req_flags = (HR_ALLOCATED | HR_MODE_WRITE); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.type != HR_TYPE_CPU_REG || !IsHostRegAllocated(i) || ((ra.flags & req_flags) == req_flags)) + continue; + + Log_DebugPrintf("Freeing non-dirty cached register %s in %s", GetRegName(ra.reg), GetHostRegName(i)); + DebugAssert(!(ra.flags & HR_MODE_WRITE)); + ClearHostReg(i); + } + + // remove any non-dirty constants too + for (u32 i = 1; i < static_cast(Reg::count); i++) + { + if (!HasConstantReg(static_cast(i)) || HasDirtyConstantReg(static_cast(i))) + continue; + + Log_DebugPrintf("Clearing non-dirty constant %s", GetRegName(static_cast(i))); + ClearConstantReg(static_cast(i)); + } + + Flush(FLUSH_LOAD_DELAY_FROM_STATE); + } + + // commit the delayed register load + FinishLoadDelay(); + + // move next load delay forward + if (m_next_load_delay_register != Reg::count) + { + // if it somehow got flushed, read it back in. 
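+    // (the allocator can spill the next-load-delay value register under pressure,
+    // which resets m_next_load_delay_value_register to NUM_HOST_REGS.)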
+ if (m_next_load_delay_value_register == NUM_HOST_REGS) + { + AllocateHostReg(HR_MODE_READ, HR_TYPE_NEXT_LOAD_DELAY_VALUE, m_next_load_delay_register); + DebugAssert(m_next_load_delay_value_register != NUM_HOST_REGS); + } + + HostRegAlloc& ra = m_host_regs[m_next_load_delay_value_register]; + ra.flags |= HR_MODE_WRITE; + ra.type = HR_TYPE_LOAD_DELAY_VALUE; + + m_load_delay_register = m_next_load_delay_register; + m_load_delay_value_register = m_next_load_delay_value_register; + m_next_load_delay_register = Reg::count; + m_next_load_delay_value_register = NUM_HOST_REGS; + } +} + +void CPU::NewRec::Compiler::FinishLoadDelay() +{ + DebugAssert(!m_load_delay_dirty); + if (!HasLoadDelay()) + return; + + // we may need to reload the value.. + if (m_load_delay_value_register == NUM_HOST_REGS) + { + AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, m_load_delay_register); + DebugAssert(m_load_delay_value_register != NUM_HOST_REGS); + } + + // kill any (old) cached value for this register + DeleteMIPSReg(m_load_delay_register, false); + + Log_DebugPrintf("Finished delayed load to %s in host register %s", GetRegName(m_load_delay_register), + GetHostRegName(m_load_delay_value_register)); + + // and swap the mode over so it gets written back later + HostRegAlloc& ra = m_host_regs[m_load_delay_value_register]; + DebugAssert(ra.reg == m_load_delay_register); + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_ALLOCATED | HR_MODE_READ | HR_MODE_WRITE; + ra.counter = m_register_alloc_counter++; + ra.type = HR_TYPE_CPU_REG; + + // constants are gone + Log_DebugPrintf("Clearing constant in %s due to load delay", GetRegName(m_load_delay_register)); + ClearConstantReg(m_load_delay_register); + + m_load_delay_register = Reg::count; + m_load_delay_value_register = NUM_HOST_REGS; +} + +void CPU::NewRec::Compiler::FinishLoadDelayToReg(Reg reg) +{ + if (m_load_delay_dirty) + { + // inter-block :( + UpdateLoadDelay(); + return; + } + + if (m_load_delay_register != reg) + return; + + FinishLoadDelay(); +} + +u32 CPU::NewRec::Compiler::GetFlagsForNewLoadDelayedReg() const +{ + return g_settings.gpu_pgxp_enable ? (HR_MODE_WRITE | HR_CALLEE_SAVED) : (HR_MODE_WRITE); +} + +void CPU::NewRec::Compiler::ClearConstantReg(Reg r) +{ + DebugAssert(r < Reg::count && r != Reg::zero); + m_constant_reg_values[static_cast(r)] = 0; + m_constant_regs_valid.reset(static_cast(r)); + m_constant_regs_dirty.reset(static_cast(r)); +} + +void CPU::NewRec::Compiler::FlushConstantRegs(bool invalidate) +{ + for (u32 i = 1; i < static_cast(Reg::count); i++) + { + if (m_constant_regs_dirty.test(static_cast(i))) + FlushConstantReg(static_cast(i)); + if (invalidate) + ClearConstantReg(static_cast(i)); + } +} + +CPU::Reg CPU::NewRec::Compiler::MipsD() const +{ + return inst->r.rd; +} + +u32 CPU::NewRec::Compiler::GetConditionalBranchTarget(CompileFlags cf) const +{ + // compiler pc has already been advanced when swapping branch delay slots + const u32 current_pc = m_compiler_pc - (cf.delay_slot_swapped ? sizeof(Instruction) : 0); + return current_pc + (inst->i.imm_sext32() << 2); +} + +u32 CPU::NewRec::Compiler::GetBranchReturnAddress(CompileFlags cf) const +{ + // compiler pc has already been advanced when swapping branch delay slots + return m_compiler_pc + (cf.delay_slot_swapped ? 
0 : sizeof(Instruction)); +} + +bool CPU::NewRec::Compiler::TrySwapDelaySlot(Reg rs, Reg rt, Reg rd) +{ + if constexpr (!SWAP_BRANCH_DELAY_SLOTS) + return false; + + const Instruction* next_instruction = inst + 1; + DebugAssert(next_instruction < (m_block->Instructions() + m_block->size)); + + const Reg opcode_rs = next_instruction->r.rs; + const Reg opcode_rt = next_instruction->r.rt; + const Reg opcode_rd = next_instruction->r.rd; + +#ifdef _DEBUG + TinyString disasm; + DisassembleInstruction(&disasm, m_current_instruction_pc + 4, next_instruction->bits); +#endif + + // Just in case we read it in the instruction.. but the block should end after this. + const Instruction* const backup_instruction = inst; + const u32 backup_instruction_pc = m_current_instruction_pc; + const bool backup_instruction_delay_slot = m_current_instruction_branch_delay_slot; + + if (next_instruction->bits == 0) + { + // nop + goto is_safe; + } + + // can't swap when the branch is the first instruction because of bloody load delays + if ((EMULATE_LOAD_DELAYS && m_block->pc == m_current_instruction_pc) || m_load_delay_dirty || + (HasLoadDelay() && (m_load_delay_register == rs || m_load_delay_register == rt || m_load_delay_register == rd))) + { + goto is_unsafe; + } + + switch (next_instruction->op) + { + case InstructionOp::addi: + case InstructionOp::addiu: + case InstructionOp::slti: + case InstructionOp::sltiu: + case InstructionOp::andi: + case InstructionOp::ori: + case InstructionOp::xori: + case InstructionOp::lui: + case InstructionOp::lb: + case InstructionOp::lh: + case InstructionOp::lwl: + case InstructionOp::lw: + case InstructionOp::lbu: + case InstructionOp::lhu: + case InstructionOp::lwr: + case InstructionOp::sb: + case InstructionOp::sh: + case InstructionOp::swl: + case InstructionOp::sw: + case InstructionOp::swr: + { + if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) || + (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) || + (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt))) + { + goto is_unsafe; + } + } + break; + + case InstructionOp::lwc2: // LWC2 + case InstructionOp::swc2: // SWC2 + break; + + case InstructionOp::funct: // SPECIAL + { + switch (next_instruction->r.funct) + { + case InstructionFunct::sll: + case InstructionFunct::srl: + case InstructionFunct::sra: + case InstructionFunct::sllv: + case InstructionFunct::srlv: + case InstructionFunct::srav: + case InstructionFunct::add: + case InstructionFunct::addu: + case InstructionFunct::sub: + case InstructionFunct::subu: + case InstructionFunct::and_: + case InstructionFunct::or_: + case InstructionFunct::xor_: + case InstructionFunct::nor: + case InstructionFunct::slt: + case InstructionFunct::sltu: + { + if ((rs != Reg::zero && rs == opcode_rd) || (rt != Reg::zero && rt == opcode_rd) || + (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) || + (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt || + m_load_delay_register == opcode_rd))) + { + goto is_unsafe; + } + } + break; + + case InstructionFunct::mult: + case InstructionFunct::multu: + case InstructionFunct::div: + case InstructionFunct::divu: + { + if (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt)) + goto is_unsafe; + } + break; + + default: + goto is_unsafe; + } + } + break; + + case InstructionOp::cop0: // COP0 + case InstructionOp::cop1: // COP1 + case InstructionOp::cop2: // COP2 + case 
InstructionOp::cop3: // COP3 + { + if (next_instruction->cop.IsCommonInstruction()) + { + switch (next_instruction->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: // MFC0 + case CopCommonInstruction::cfcn: // CFC0 + { + if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) || + (rd != Reg::zero && rd == opcode_rt) || (HasLoadDelay() && m_load_delay_register == opcode_rt)) + { + goto is_unsafe; + } + } + break; + + case CopCommonInstruction::mtcn: // MTC0 + case CopCommonInstruction::ctcn: // CTC0 + break; + } + } + else + { + // swap when it's GTE + if (next_instruction->op != InstructionOp::cop2) + goto is_unsafe; + } + } + break; + + default: + goto is_unsafe; + } + +is_safe: +#ifdef _DEBUG + Log_DevFmt("Swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm); +#endif + + CompileBranchDelaySlot(); + + inst = backup_instruction; + m_current_instruction_pc = backup_instruction_pc; + m_current_instruction_branch_delay_slot = backup_instruction_delay_slot; + return true; + +is_unsafe: +#ifdef _DEBUG + Log_DevFmt("NOT swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm); +#endif + + return false; +} + +void CPU::NewRec::Compiler::SetCompilerPC(u32 newpc) +{ + m_compiler_pc = newpc; + m_dirty_pc = true; +} + +u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags) +{ + const u32 req_flags = HR_USABLE | (flags & HR_CALLEE_SAVED); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((m_host_regs[i].flags & (req_flags | HR_NEEDED | HR_ALLOCATED)) == req_flags) + return i; + } + + // find register with lowest counter + u32 lowest = NUM_HOST_REGS; + u16 lowest_count = std::numeric_limits::max(); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + const HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (req_flags | HR_NEEDED)) != req_flags) + continue; + + DebugAssert(ra.flags & HR_ALLOCATED); + if (ra.type == HR_TYPE_TEMP) + { + // can't punt temps + continue; + } + + if (ra.counter < lowest_count) + { + lowest = i; + lowest_count = ra.counter; + } + } + + // + + AssertMsg(lowest != NUM_HOST_REGS, "Register allocation failed."); + + const HostRegAlloc& ra = m_host_regs[lowest]; + switch (ra.type) + { + case HR_TYPE_CPU_REG: + { + // If the register is needed later, and we're allocating a callee-saved register, try moving it to a caller-saved + // register. 
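+        // i.e. keep the still-needed guest value cached in another host register
+        // instead of spilling it back to CPU state.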
+ if (iinfo->UsedTest(ra.reg) && flags & HR_CALLEE_SAVED) + { + u32 caller_saved_lowest = NUM_HOST_REGS; + u16 caller_saved_lowest_count = std::numeric_limits::max(); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + constexpr u32 caller_req_flags = HR_USABLE; + constexpr u32 caller_req_mask = HR_USABLE | HR_NEEDED | HR_CALLEE_SAVED; + const HostRegAlloc& caller_ra = m_host_regs[i]; + if ((caller_ra.flags & caller_req_mask) != caller_req_flags) + continue; + + if (!(caller_ra.flags & HR_ALLOCATED)) + { + caller_saved_lowest = i; + caller_saved_lowest_count = 0; + break; + } + + if (caller_ra.type == HR_TYPE_TEMP) + continue; + + if (caller_ra.counter < caller_saved_lowest_count) + { + caller_saved_lowest = i; + caller_saved_lowest_count = caller_ra.counter; + } + } + + if (caller_saved_lowest_count < lowest_count) + { + Log_DebugPrintf("Moving caller-saved host register %s with MIPS register %s to %s for allocation", + GetHostRegName(lowest), GetRegName(ra.reg), GetHostRegName(caller_saved_lowest)); + if (IsHostRegAllocated(caller_saved_lowest)) + FreeHostReg(caller_saved_lowest); + CopyHostReg(caller_saved_lowest, lowest); + SwapHostRegAlloc(caller_saved_lowest, lowest); + DebugAssert(!IsHostRegAllocated(lowest)); + return lowest; + } + } + + Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetHostRegName(lowest), + GetRegName(ra.reg)); + } + break; + case HR_TYPE_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Freeing load delay register %s in host register %s for allocation", GetHostRegName(lowest), + GetRegName(ra.reg)); + } + break; + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation", + GetHostRegName(lowest), GetRegName(ra.reg)); + } + break; + default: + { + Panic("Unknown type freed"); + } + break; + } + + FreeHostReg(lowest); + return lowest; +} + +const char* CPU::NewRec::Compiler::GetReadWriteModeString(u32 flags) +{ + if ((flags & (HR_MODE_READ | HR_MODE_WRITE)) == (HR_MODE_READ | HR_MODE_WRITE)) + return "read-write"; + else if (flags & HR_MODE_READ) + return "read-only"; + else if (flags & HR_MODE_WRITE) + return "write-only"; + else + return "UNKNOWN"; +} + +u32 CPU::NewRec::Compiler::AllocateHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */, + Reg reg /* = Reg::count */) +{ + // Cancel any load delays before booting anything out + if (flags & HR_MODE_WRITE && (type == HR_TYPE_CPU_REG || type == HR_TYPE_NEXT_LOAD_DELAY_VALUE)) + CancelLoadDelaysToReg(reg); + + // Already have a matching type? + if (type != HR_TYPE_TEMP) + { + const std::optional check_reg = CheckHostReg(flags, type, reg); + + // shouldn't be allocating >1 load delay in a single instruction.. 
+ // TODO: prefer callee saved registers for load delay + DebugAssert((type != HR_TYPE_LOAD_DELAY_VALUE && type != HR_TYPE_NEXT_LOAD_DELAY_VALUE) || !check_reg.has_value()); + if (check_reg.has_value()) + return check_reg.value(); + } + + const u32 hreg = GetFreeHostReg(flags); + HostRegAlloc& ra = m_host_regs[hreg]; + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | (flags & ALLOWED_HR_FLAGS) | HR_ALLOCATED | HR_NEEDED; + ra.type = type; + ra.reg = reg; + ra.counter = m_register_alloc_counter++; + + switch (type) + { + case HR_TYPE_CPU_REG: + { + DebugAssert(reg != Reg::zero); + + Log_DebugPrintf("Allocate host reg %s to guest reg %s in %s mode", GetHostRegName(hreg), GetRegName(reg), + GetReadWriteModeString(flags)); + + if (flags & HR_MODE_READ) + { + DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count); + + if (HasConstantReg(reg)) + { + // may as well flush it now + Log_DebugPrintf("Flush constant register in guest reg %s to host reg %s", GetRegName(reg), + GetHostRegName(hreg)); + LoadHostRegWithConstant(hreg, GetConstantRegU32(reg)); + m_constant_regs_dirty.reset(static_cast(reg)); + ra.flags |= HR_MODE_WRITE; + } + else + { + LoadHostRegFromCPUPointer(hreg, &g_state.regs.r[static_cast(reg)]); + } + } + + if (flags & HR_MODE_WRITE && HasConstantReg(reg)) + { + DebugAssert(reg != Reg::zero); + Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg), + GetHostRegName(hreg)); + + ClearConstantReg(reg); + } + } + break; + + case HR_TYPE_LOAD_DELAY_VALUE: + { + DebugAssert(!m_load_delay_dirty && (!HasLoadDelay() || !(flags & HR_MODE_WRITE))); + Log_DebugPrintf("Allocating load delayed guest register %s in host reg %s in %s mode", GetRegName(reg), + GetHostRegName(hreg), GetReadWriteModeString(flags)); + m_load_delay_register = reg; + m_load_delay_value_register = hreg; + if (flags & HR_MODE_READ) + LoadHostRegFromCPUPointer(hreg, &g_state.load_delay_value); + } + break; + + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Allocating next load delayed guest register %s in host reg %s in %s mode", GetRegName(reg), + GetHostRegName(hreg), GetReadWriteModeString(flags)); + m_next_load_delay_register = reg; + m_next_load_delay_value_register = hreg; + if (flags & HR_MODE_READ) + LoadHostRegFromCPUPointer(hreg, &g_state.next_load_delay_value); + } + break; + + case HR_TYPE_TEMP: + { + DebugAssert(!(flags & (HR_MODE_READ | HR_MODE_WRITE))); + Log_DebugPrintf("Allocate host reg %s as temporary", GetHostRegName(hreg)); + } + break; + + default: + Panic("Unknown type"); + break; + } + + return hreg; +} + +std::optional CPU::NewRec::Compiler::CheckHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */, + Reg reg /* = Reg::count */) +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (!(ra.flags & HR_ALLOCATED) || ra.type != type || ra.reg != reg) + continue; + + DebugAssert(ra.flags & HR_MODE_READ); + if (flags & HR_MODE_WRITE) + { + DebugAssert(type == HR_TYPE_CPU_REG); + if (!(ra.flags & HR_MODE_WRITE)) + { + Log_DebugPrintf("Switch guest reg %s from read to read-write in host reg %s", GetRegName(reg), + GetHostRegName(i)); + } + + if (HasConstantReg(reg)) + { + DebugAssert(reg != Reg::zero); + Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg), + GetHostRegName(i)); + + ClearConstantReg(reg); + } + } + + ra.flags |= (flags & ALLOWED_HR_FLAGS) | HR_NEEDED; + ra.counter = m_register_alloc_counter++; + + // Need a callee saved reg? 
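+    // The caller wants this value to survive an upcoming C call; if it currently
+    // lives in a caller-saved register, migrate it to a callee-saved one.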
+ if (flags & HR_CALLEE_SAVED && !(ra.flags & HR_CALLEE_SAVED)) + { + // Need to move it to one which is + const u32 new_reg = GetFreeHostReg(HR_CALLEE_SAVED); + Log_DebugPrintf("Rename host reg %s to %s for callee saved", GetHostRegName(i), GetHostRegName(new_reg)); + + CopyHostReg(new_reg, i); + SwapHostRegAlloc(i, new_reg); + DebugAssert(!IsHostRegAllocated(i)); + return new_reg; + } + + return i; + } + + return std::nullopt; +} + +u32 CPU::NewRec::Compiler::AllocateTempHostReg(u32 flags) +{ + return AllocateHostReg(flags, HR_TYPE_TEMP); +} + +void CPU::NewRec::Compiler::SwapHostRegAlloc(u32 lhs, u32 rhs) +{ + HostRegAlloc& lra = m_host_regs[lhs]; + HostRegAlloc& rra = m_host_regs[rhs]; + + const u8 lra_flags = lra.flags; + lra.flags = (lra.flags & IMMUTABLE_HR_FLAGS) | (rra.flags & ~IMMUTABLE_HR_FLAGS); + rra.flags = (rra.flags & IMMUTABLE_HR_FLAGS) | (lra_flags & ~IMMUTABLE_HR_FLAGS); + std::swap(lra.type, rra.type); + std::swap(lra.reg, rra.reg); + std::swap(lra.counter, rra.counter); +} + +void CPU::NewRec::Compiler::FlushHostReg(u32 reg) +{ + HostRegAlloc& ra = m_host_regs[reg]; + if (ra.flags & HR_MODE_WRITE) + { + switch (ra.type) + { + case HR_TYPE_CPU_REG: + { + DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count); + Log_DebugPrintf("Flushing register %s in host register %s to state", GetRegName(ra.reg), GetHostRegName(reg)); + StoreHostRegToCPUPointer(reg, &g_state.regs.r[static_cast(ra.reg)]); + } + break; + + case HR_TYPE_LOAD_DELAY_VALUE: + { + DebugAssert(m_load_delay_value_register == reg); + Log_DebugPrintf("Flushing load delayed register %s in host register %s to state", GetRegName(ra.reg), + GetHostRegName(reg)); + + StoreHostRegToCPUPointer(reg, &g_state.load_delay_value); + m_load_delay_value_register = NUM_HOST_REGS; + } + break; + + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + DebugAssert(m_next_load_delay_value_register == reg); + Log_WarningPrintf("Flushing NEXT load delayed register %s in host register %s to state", GetRegName(ra.reg), + GetHostRegName(reg)); + + StoreHostRegToCPUPointer(reg, &g_state.next_load_delay_value); + m_next_load_delay_value_register = NUM_HOST_REGS; + } + break; + + default: + break; + } + + ra.flags = (ra.flags & ~HR_MODE_WRITE) | HR_MODE_READ; + } +} + +void CPU::NewRec::Compiler::FreeHostReg(u32 reg) +{ + DebugAssert(IsHostRegAllocated(reg)); + FlushHostReg(reg); + ClearHostReg(reg); +} + +void CPU::NewRec::Compiler::ClearHostReg(u32 reg) +{ + HostRegAlloc& ra = m_host_regs[reg]; + ra.flags &= IMMUTABLE_HR_FLAGS; + ra.type = HR_TYPE_TEMP; + ra.counter = 0; + ra.reg = Reg::count; +} + +void CPU::NewRec::Compiler::MarkRegsNeeded(HostRegAllocType type, Reg reg) +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == type && ra.reg == reg) + ra.flags |= HR_NEEDED; + } +} + +void CPU::NewRec::Compiler::RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg) +{ + // only supported for cpu regs for now + DebugAssert(new_type == HR_TYPE_TEMP || new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE); + + const std::optional old_reg = CheckHostReg(0, new_type, new_reg); + if (old_reg.has_value()) + { + // don't writeback + ClearHostReg(old_reg.value()); + } + + // kill any load delay to this reg + if (new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE) + CancelLoadDelaysToReg(new_reg); + + if (new_type == HR_TYPE_CPU_REG) + { + Log_DebugPrintf("Renaming host reg %s to guest reg %s", GetHostRegName(reg), 
GetRegName(new_reg)); + } + else if (new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE) + { + Log_DebugPrintf("Renaming host reg %s to load delayed guest reg %s", GetHostRegName(reg), GetRegName(new_reg)); + DebugAssert(m_next_load_delay_register == Reg::count && m_next_load_delay_value_register == NUM_HOST_REGS); + m_next_load_delay_register = new_reg; + m_next_load_delay_value_register = reg; + } + else + { + Log_DebugPrintf("Renaming host reg %s to temp", GetHostRegName(reg)); + } + + HostRegAlloc& ra = m_host_regs[reg]; + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_NEEDED | HR_ALLOCATED | (new_flags & ALLOWED_HR_FLAGS); + ra.counter = m_register_alloc_counter++; + ra.type = new_type; + ra.reg = new_reg; +} + +void CPU::NewRec::Compiler::ClearHostRegNeeded(u32 reg) +{ + DebugAssert(reg < NUM_HOST_REGS && IsHostRegAllocated(reg)); + HostRegAlloc& ra = m_host_regs[reg]; + if (ra.flags & HR_MODE_WRITE) + ra.flags |= HR_MODE_READ; + + ra.flags &= ~HR_NEEDED; +} + +void CPU::NewRec::Compiler::ClearHostRegsNeeded() +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (!(ra.flags & HR_ALLOCATED)) + continue; + + // shouldn't have any temps left + DebugAssert(ra.type != HR_TYPE_TEMP); + + if (ra.flags & HR_MODE_WRITE) + ra.flags |= HR_MODE_READ; + + ra.flags &= ~HR_NEEDED; + } +} + +void CPU::NewRec::Compiler::DeleteMIPSReg(Reg reg, bool flush) +{ + DebugAssert(reg != Reg::zero); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG && ra.reg == reg) + { + if (flush) + FlushHostReg(i); + ClearHostReg(i); + ClearConstantReg(reg); + return; + } + } + + if (flush) + FlushConstantReg(reg); + ClearConstantReg(reg); +} + +bool CPU::NewRec::Compiler::TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other) +{ + // can't rename when in form Rd = Rs op Rt and Rd == Rs or Rd == Rt + if (to == from || to == other || !iinfo->RenameTest(from)) + return false; + + Log_DebugPrintf("Renaming MIPS register %s to %s", GetRegName(from), GetRegName(to)); + + if (iinfo->LiveTest(from)) + FlushHostReg(fromhost); + + // remove all references to renamed-to register + DeleteMIPSReg(to, false); + + // and do the actual rename, new register has been modified. + m_host_regs[fromhost].reg = to; + m_host_regs[fromhost].flags |= HR_MODE_READ | HR_MODE_WRITE; + return true; +} + +void CPU::NewRec::Compiler::UpdateHostRegCounters() +{ + const CodeCache::InstructionInfo* const info_end = m_block->InstructionsInfo() + m_block->size; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (HR_ALLOCATED | HR_NEEDED)) != HR_ALLOCATED) + continue; + + // Try not to punt out load delays. 
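+    // Load-delay values get the maximum counter so the allocator evicts them last.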
+ if (ra.type != HR_TYPE_CPU_REG) + { + ra.counter = std::numeric_limits::max(); + continue; + } + + DebugAssert(IsHostRegAllocated(i)); + const CodeCache::InstructionInfo* cur = iinfo; + const Reg reg = ra.reg; + if (!(cur->reg_flags[static_cast(reg)] & CodeCache::RI_USED)) + { + ra.counter = 0; + continue; + } + + // order based on the number of instructions until this register is used + u16 counter_val = std::numeric_limits::max(); + for (; cur != info_end; cur++, counter_val--) + { + if (cur->ReadsReg(reg)) + break; + } + + ra.counter = counter_val; + } +} + +void CPU::NewRec::Compiler::Flush(u32 flags) +{ + // TODO: Flush unneeded caller-saved regs (backup/replace calle-saved needed with caller-saved) + if (flags & + (FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_FREE_ALL_REGISTERS)) + { + const u32 req_mask = (flags & FLUSH_FREE_ALL_REGISTERS) ? + HR_ALLOCATED : + ((flags & FLUSH_FREE_CALLER_SAVED_REGISTERS) ? (HR_ALLOCATED | HR_CALLEE_SAVED) : + (HR_ALLOCATED | HR_CALLEE_SAVED | HR_NEEDED)); + constexpr u32 req_flags = HR_ALLOCATED; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & req_mask) == req_flags) + FreeHostReg(i); + } + } + + if (flags & FLUSH_INVALIDATE_MIPS_REGISTERS) + { + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG) + FreeHostReg(i); + } + + FlushConstantRegs(true); + } + else + { + if (flags & FLUSH_FLUSH_MIPS_REGISTERS) + { + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (HR_ALLOCATED | HR_MODE_WRITE)) == (HR_ALLOCATED | HR_MODE_WRITE) && ra.type == HR_TYPE_CPU_REG) + FlushHostReg(i); + } + + // flush any constant registers which are dirty too + FlushConstantRegs(false); + } + } +} + +void CPU::NewRec::Compiler::FlushConstantReg(Reg r) +{ + DebugAssert(m_constant_regs_valid.test(static_cast(r))); + Log_DebugPrintf("Writing back register %s with constant value 0x%08X", GetRegName(r), + m_constant_reg_values[static_cast(r)]); + StoreConstantToCPUPointer(m_constant_reg_values[static_cast(r)], &g_state.regs.r[static_cast(r)]); + m_constant_regs_dirty.reset(static_cast(r)); +} + +void CPU::NewRec::Compiler::BackupHostState() +{ + DebugAssert(m_host_state_backup_count < m_host_state_backup.size()); + + // need to back up everything... 
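+  // ...so that a speculatively compiled path (e.g. the taken side of a branch)
+  // can be fully unwound afterwards with RestoreHostState().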
+ HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count]; + bu.cycles = m_cycles; + bu.gte_done_cycle = m_gte_done_cycle; + bu.compiler_pc = m_compiler_pc; + bu.dirty_pc = m_dirty_pc; + bu.dirty_instruction_bits = m_dirty_instruction_bits; + bu.dirty_gte_done_cycle = m_dirty_gte_done_cycle; + bu.block_ended = m_block_ended; + bu.inst = inst; + bu.current_instruction_pc = m_current_instruction_pc; + bu.current_instruction_delay_slot = m_current_instruction_branch_delay_slot; + bu.const_regs_valid = m_constant_regs_valid; + bu.const_regs_dirty = m_constant_regs_dirty; + bu.const_regs_values = m_constant_reg_values; + bu.host_regs = m_host_regs; + bu.register_alloc_counter = m_register_alloc_counter; + bu.load_delay_dirty = m_load_delay_dirty; + bu.load_delay_register = m_load_delay_register; + bu.load_delay_value_register = m_load_delay_value_register; + bu.next_load_delay_register = m_next_load_delay_register; + bu.next_load_delay_value_register = m_next_load_delay_value_register; + m_host_state_backup_count++; +} + +void CPU::NewRec::Compiler::RestoreHostState() +{ + DebugAssert(m_host_state_backup_count > 0); + m_host_state_backup_count--; + + HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count]; + m_host_regs = std::move(bu.host_regs); + m_constant_reg_values = std::move(bu.const_regs_values); + m_constant_regs_dirty = std::move(bu.const_regs_dirty); + m_constant_regs_valid = std::move(bu.const_regs_valid); + m_current_instruction_branch_delay_slot = bu.current_instruction_delay_slot; + m_current_instruction_pc = bu.current_instruction_pc; + inst = bu.inst; + m_block_ended = bu.block_ended; + m_dirty_gte_done_cycle = bu.dirty_gte_done_cycle; + m_dirty_instruction_bits = bu.dirty_instruction_bits; + m_dirty_pc = bu.dirty_pc; + m_compiler_pc = bu.compiler_pc; + m_register_alloc_counter = bu.register_alloc_counter; + m_load_delay_dirty = bu.load_delay_dirty; + m_load_delay_register = bu.load_delay_register; + m_load_delay_value_register = bu.load_delay_value_register; + m_next_load_delay_register = bu.next_load_delay_register; + m_next_load_delay_value_register = bu.next_load_delay_value_register; + m_gte_done_cycle = bu.gte_done_cycle; + m_cycles = bu.cycles; +} + +void CPU::NewRec::Compiler::AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register, + MemoryAccessSize size, bool is_signed, bool is_load) +{ + DebugAssert(CodeCache::IsUsingFastmem()); + DebugAssert(address_register < NUM_HOST_REGS); + DebugAssert(data_register < NUM_HOST_REGS); + + u32 gpr_bitmask = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if (IsHostRegAllocated(i)) + gpr_bitmask |= (1u << i); + } + + CPU::CodeCache::AddLoadStoreInfo(code_address, code_size, m_current_instruction_pc, m_cycles, gpr_bitmask, + static_cast(address_register), static_cast(data_register), size, is_signed, + is_load); +} + +void CPU::NewRec::Compiler::CompileInstruction() +{ +#ifdef _DEBUG + TinyString str; + DisassembleInstruction(&str, m_current_instruction_pc, inst->bits); + Log_DebugFmt("Compiling{} {:08X}: {}", m_current_instruction_branch_delay_slot ? 
" branch delay slot" : "", + m_current_instruction_pc, str); +#endif + + m_cycles++; + + if (IsNopInstruction(*inst)) + { + UpdateLoadDelay(); + return; + } + + switch (inst->op) + { +#define PGXPFN(x) reinterpret_cast(&PGXP::x) + + // clang-format off + // TODO: PGXP for jalr + + case InstructionOp::funct: + { + switch (inst->r.funct) + { + case InstructionFunct::sll: CompileTemplate(&Compiler::Compile_sll_const, &Compiler::Compile_sll, PGXPFN(CPU_SLL), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::srl: CompileTemplate(&Compiler::Compile_srl_const, &Compiler::Compile_srl, PGXPFN(CPU_SRL), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::sra: CompileTemplate(&Compiler::Compile_sra_const, &Compiler::Compile_sra, PGXPFN(CPU_SRA), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::sllv: CompileTemplate(&Compiler::Compile_sllv_const, &Compiler::Compile_sllv, PGXPFN(CPU_SLLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::srlv: CompileTemplate(&Compiler::Compile_srlv_const, &Compiler::Compile_srlv, PGXPFN(CPU_SRLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::srav: CompileTemplate(&Compiler::Compile_srav_const, &Compiler::Compile_srav, PGXPFN(CPU_SRAV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::jr: CompileTemplate(&Compiler::Compile_jr_const, &Compiler::Compile_jr, nullptr, TF_READS_S); break; + case InstructionFunct::jalr: CompileTemplate(&Compiler::Compile_jalr_const, &Compiler::Compile_jalr, nullptr, /*TF_WRITES_D |*/ TF_READS_S | TF_NO_NOP); break; + case InstructionFunct::syscall: Compile_syscall(); break; + case InstructionFunct::break_: Compile_break(); break; + case InstructionFunct::mfhi: CompileMoveRegTemplate(inst->r.rd, Reg::hi, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mthi: CompileMoveRegTemplate(Reg::hi, inst->r.rs, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mflo: CompileMoveRegTemplate(inst->r.rd, Reg::lo, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mtlo: CompileMoveRegTemplate(Reg::lo, inst->r.rs, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mult: CompileTemplate(&Compiler::Compile_mult_const, &Compiler::Compile_mult, PGXPFN(CPU_MULT), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break; + case InstructionFunct::multu: CompileTemplate(&Compiler::Compile_multu_const, &Compiler::Compile_multu, PGXPFN(CPU_MULTU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break; + case InstructionFunct::div: CompileTemplate(&Compiler::Compile_div_const, &Compiler::Compile_div, PGXPFN(CPU_DIV), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break; + case InstructionFunct::divu: CompileTemplate(&Compiler::Compile_divu_const, &Compiler::Compile_divu, PGXPFN(CPU_DIVU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break; + case InstructionFunct::add: CompileTemplate(&Compiler::Compile_add_const, &Compiler::Compile_add, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::addu: CompileTemplate(&Compiler::Compile_addu_const, &Compiler::Compile_addu, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::sub: CompileTemplate(&Compiler::Compile_sub_const, &Compiler::Compile_sub, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break; + case 
InstructionFunct::subu: CompileTemplate(&Compiler::Compile_subu_const, &Compiler::Compile_subu, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::and_: CompileTemplate(&Compiler::Compile_and_const, &Compiler::Compile_and, PGXPFN(CPU_AND_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break; + case InstructionFunct::or_: CompileTemplate(&Compiler::Compile_or_const, &Compiler::Compile_or, PGXPFN(CPU_OR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::xor_: CompileTemplate(&Compiler::Compile_xor_const, &Compiler::Compile_xor, PGXPFN(CPU_XOR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::nor: CompileTemplate(&Compiler::Compile_nor_const, &Compiler::Compile_nor, PGXPFN(CPU_NOR), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break; + case InstructionFunct::slt: CompileTemplate(&Compiler::Compile_slt_const, &Compiler::Compile_slt, PGXPFN(CPU_SLT), TF_WRITES_D | TF_READS_T | TF_READS_S); break; + case InstructionFunct::sltu: CompileTemplate(&Compiler::Compile_sltu_const, &Compiler::Compile_sltu, PGXPFN(CPU_SLTU), TF_WRITES_D | TF_READS_T | TF_READS_S); break; + + default: Panic("fixme funct"); break; + } + } + break; + + case InstructionOp::j: Compile_j(); break; + case InstructionOp::jal: Compile_jal(); break; + + case InstructionOp::b: CompileTemplate(&Compiler::Compile_b_const, &Compiler::Compile_b, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::blez: CompileTemplate(&Compiler::Compile_blez_const, &Compiler::Compile_blez, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::bgtz: CompileTemplate(&Compiler::Compile_bgtz_const, &Compiler::Compile_bgtz, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::beq: CompileTemplate(&Compiler::Compile_beq_const, &Compiler::Compile_beq, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::bne: CompileTemplate(&Compiler::Compile_bne_const, &Compiler::Compile_bne, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break; + + case InstructionOp::addi: CompileTemplate(&Compiler::Compile_addi_const, &Compiler::Compile_addi, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::addiu: CompileTemplate(&Compiler::Compile_addiu_const, &Compiler::Compile_addiu, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::slti: CompileTemplate(&Compiler::Compile_slti_const, &Compiler::Compile_slti, PGXPFN(CPU_SLTI), TF_WRITES_T | TF_READS_S); break; + case InstructionOp::sltiu: CompileTemplate(&Compiler::Compile_sltiu_const, &Compiler::Compile_sltiu, PGXPFN(CPU_SLTIU), TF_WRITES_T | TF_READS_S); break; + case InstructionOp::andi: CompileTemplate(&Compiler::Compile_andi_const, &Compiler::Compile_andi, PGXPFN(CPU_ANDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE); break; + case InstructionOp::ori: CompileTemplate(&Compiler::Compile_ori_const, &Compiler::Compile_ori, PGXPFN(CPU_ORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::xori: CompileTemplate(&Compiler::Compile_xori_const, &Compiler::Compile_xori, PGXPFN(CPU_XORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case 
InstructionOp::lui: Compile_lui(); break; + + case InstructionOp::lb: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lbu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lh: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lhu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lw: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Word, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lwl: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::lwr: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::sb: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Byte, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::sh: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::HalfWord, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::sw: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Word, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::swl: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::swr: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + + case InstructionOp::cop0: + { + if (inst->cop.IsCommonInstruction()) + { + switch (inst->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc0, nullptr, TF_WRITES_T | TF_LOAD_DELAY); } break; + case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc0, PGXPFN(CPU_MTC0), TF_READS_T); break; + default: Compile_Fallback(); break; + } + } + else + { + switch (inst->cop.Cop0Op()) + { + case Cop0Instruction::rfe: CompileTemplate(nullptr, &Compiler::Compile_rfe, nullptr, 0); break; + default: Compile_Fallback(); break; + } + } + } + break; + + case InstructionOp::cop2: + { + if (inst->cop.IsCommonInstruction()) + { + switch (inst->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break; + case CopCommonInstruction::cfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break; + case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break; + case CopCommonInstruction::ctcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break; + default: Compile_Fallback(); break; + } + } + else + { + // GTE ops + CompileTemplate(nullptr, &Compiler::Compile_cop2, nullptr, TF_GTE_STALL); 
+ } + } + break; + + case InstructionOp::lwc2: CompileLoadStoreTemplate(&Compiler::Compile_lwc2, MemoryAccessSize::Word, false, false, TF_GTE_STALL | TF_READS_S | TF_LOAD_DELAY); break; + case InstructionOp::swc2: CompileLoadStoreTemplate(&Compiler::Compile_swc2, MemoryAccessSize::Word, true, false, TF_GTE_STALL | TF_READS_S); break; + + default: Panic("Fixme"); break; + // clang-format on + +#undef PGXPFN + } + + ClearHostRegsNeeded(); + UpdateLoadDelay(); + +#if 0 + const void* end = GetCurrentCodePointer(); + if (start != end && !m_current_instruction_branch_delay_slot) + { + CodeCache::DisassembleAndLogHostCode(start, + static_cast(static_cast(end) - static_cast(start))); + } +#endif +} + +void CPU::NewRec::Compiler::CompileBranchDelaySlot(bool dirty_pc /* = true */) +{ + // Update load delay at the end of the previous instruction. + UpdateLoadDelay(); + + // TODO: Move cycle add before this. + inst++; + iinfo++; + m_current_instruction_pc += sizeof(Instruction); + m_current_instruction_branch_delay_slot = true; + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = dirty_pc; + m_dirty_instruction_bits = true; + + CompileInstruction(); + + m_current_instruction_branch_delay_slot = false; +} + +void CPU::NewRec::Compiler::CompileTemplate(void (Compiler::*const_func)(CompileFlags), + void (Compiler::*func)(CompileFlags), const void* pgxp_cpu_func, u32 tflags) +{ + // TODO: This is where we will do memory operand optimization. Remember to kill constants! + // TODO: Swap S and T if commutative + // TODO: For and, treat as zeroing if imm is zero + // TODO: Optimize slt + bne to cmp + jump + // TODO: Prefer memory operands when load delay is dirty, since we're going to invalidate immediately after the first + // instruction.. + // TODO: andi with zero -> zero const + // TODO: load constant so it can be flushed if it's not overwritten later + // TODO: inline PGXP ops. + // TODO: don't rename on sltu. 
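+  // CompileTemplate() handles most ALU-style instructions: operands are described
+  // by the TF_* flags; the constant-folded path (const_func) is taken when every
+  // read operand has a known constant value, otherwise host registers are
+  // allocated and the emitting path (func) runs.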
+ + bool allow_constant = static_cast(const_func); + Reg rs = inst->r.rs.GetValue(); + Reg rt = inst->r.rt.GetValue(); + Reg rd = inst->r.rd.GetValue(); + + if (tflags & TF_GTE_STALL) + StallUntilGTEComplete(); + + // throw away instructions writing to $zero + if (!(tflags & TF_NO_NOP) && (!g_settings.cpu_recompiler_memory_exceptions || !(tflags & TF_CAN_OVERFLOW)) && + ((tflags & TF_WRITES_T && rt == Reg::zero) || (tflags & TF_WRITES_D && rd == Reg::zero))) + { + Log_DebugPrintf("Skipping instruction because it writes to zero"); + return; + } + + // handle rename operations + if ((tflags & TF_RENAME_WITH_ZERO_T && HasConstantRegValue(rt, 0))) + { + DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T)); + CompileMoveRegTemplate(rd, rs, true); + return; + } + else if ((tflags & (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE)) == (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE) && + HasConstantRegValue(rs, 0)) + { + DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T)); + CompileMoveRegTemplate(rd, rt, true); + return; + } + else if (tflags & TF_RENAME_WITH_ZERO_IMM && inst->i.imm == 0) + { + CompileMoveRegTemplate(rt, rs, true); + return; + } + + if (pgxp_cpu_func && g_settings.gpu_pgxp_enable && ((tflags & TF_PGXP_WITHOUT_CPU) || g_settings.UsingPGXPCPUMode())) + { + std::array reg_args = {{Reg::count, Reg::count}}; + u32 num_reg_args = 0; + if (tflags & TF_READS_S) + reg_args[num_reg_args++] = rs; + if (tflags & TF_READS_T) + reg_args[num_reg_args++] = rt; + if (tflags & TF_READS_LO) + reg_args[num_reg_args++] = Reg::lo; + if (tflags & TF_READS_HI) + reg_args[num_reg_args++] = Reg::hi; + + DebugAssert(num_reg_args <= 2); + GeneratePGXPCallWithMIPSRegs(pgxp_cpu_func, inst->bits, reg_args[0], reg_args[1]); + } + + // if it's a commutative op, and we have one constant reg but not the other, swap them + // TODO: make it swap when writing to T as well + // TODO: drop the hack for rd == rt + if (tflags & TF_COMMUTATIVE && !(tflags & TF_WRITES_T) && + ((HasConstantReg(rs) && !HasConstantReg(rt)) || (tflags & TF_WRITES_D && rd == rt))) + { + Log_DebugPrintf("Swapping S:%s and T:%s due to commutative op and constants", GetRegName(rs), GetRegName(rt)); + std::swap(rs, rt); + } + + CompileFlags cf = {}; + + if (tflags & TF_READS_S) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rs); + if (HasConstantReg(rs)) + cf.const_s = true; + else + allow_constant = false; + } + if (tflags & TF_READS_T) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rt); + if (HasConstantReg(rt)) + cf.const_t = true; + else + allow_constant = false; + } + if (tflags & TF_READS_LO) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::lo); + if (HasConstantReg(Reg::lo)) + cf.const_lo = true; + else + allow_constant = false; + } + if (tflags & TF_READS_HI) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::hi); + if (HasConstantReg(Reg::hi)) + cf.const_hi = true; + else + allow_constant = false; + } + + // Needed because of potential swapping + if (tflags & TF_READS_S) + cf.mips_s = static_cast(rs); + if (tflags & (TF_READS_T | TF_WRITES_T)) + cf.mips_t = static_cast(rt); + + if (allow_constant) + { + // woot, constant path + (this->*const_func)(cf); + return; + } + + UpdateHostRegCounters(); + + if (tflags & TF_CAN_SWAP_DELAY_SLOT && TrySwapDelaySlot(cf.MipsS(), cf.MipsT())) + cf.delay_slot_swapped = true; + + if (tflags & TF_READS_S && + (tflags & TF_NEEDS_REG_S || !cf.const_s || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rs))) + { + cf.host_s = 
AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + cf.const_s = false; + cf.valid_host_s = true; + } + + if (tflags & TF_READS_T && + (tflags & (TF_NEEDS_REG_T | TF_WRITES_T) || !cf.const_t || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rt))) + { + cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + cf.const_t = false; + cf.valid_host_t = true; + } + + if (tflags & (TF_READS_LO | TF_WRITES_LO)) + { + cf.host_lo = + AllocateHostReg(((tflags & TF_READS_LO) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_LO) ? HR_MODE_WRITE : 0u), + HR_TYPE_CPU_REG, Reg::lo); + cf.const_lo = false; + cf.valid_host_lo = true; + } + + if (tflags & (TF_READS_HI | TF_WRITES_HI)) + { + cf.host_hi = + AllocateHostReg(((tflags & TF_READS_HI) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_HI) ? HR_MODE_WRITE : 0u), + HR_TYPE_CPU_REG, Reg::hi); + cf.const_hi = false; + cf.valid_host_hi = true; + } + + const HostRegAllocType write_type = + (tflags & TF_LOAD_DELAY && EMULATE_LOAD_DELAYS) ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG; + + if (tflags & TF_CAN_OVERFLOW && g_settings.cpu_recompiler_memory_exceptions) + { + // allocate a temp register for the result, then swap it back + const u32 tempreg = AllocateHostReg(0, HR_TYPE_TEMP); + ; + if (tflags & TF_WRITES_D) + { + cf.host_d = tempreg; + cf.valid_host_d = true; + } + else if (tflags & TF_WRITES_T) + { + cf.host_t = tempreg; + cf.valid_host_t = true; + } + + (this->*func)(cf); + + if (tflags & TF_WRITES_D && rd != Reg::zero) + { + DeleteMIPSReg(rd, false); + RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rd); + } + else if (tflags & TF_WRITES_T && rt != Reg::zero) + { + DeleteMIPSReg(rt, false); + RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rt); + } + else + { + FreeHostReg(tempreg); + } + } + else + { + if (tflags & TF_WRITES_D && rd != Reg::zero) + { + if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rd, rs, cf.host_s, Reg::count)) + cf.host_d = cf.host_s; + else + cf.host_d = AllocateHostReg(HR_MODE_WRITE, write_type, rd); + cf.valid_host_d = true; + } + + if (tflags & TF_WRITES_T && rt != Reg::zero) + { + if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rt, rs, cf.host_s, Reg::count)) + cf.host_t = cf.host_s; + else + cf.host_t = AllocateHostReg(HR_MODE_WRITE, write_type, rt); + cf.valid_host_t = true; + } + + (this->*func)(cf); + } +} + +void CPU::NewRec::Compiler::CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool, + const std::optional&), + MemoryAccessSize size, bool store, bool sign, u32 tflags) +{ + const Reg rs = inst->i.rs; + const Reg rt = inst->i.rt; + + if (tflags & TF_GTE_STALL) + StallUntilGTEComplete(); + + CompileFlags cf = {}; + + if (tflags & TF_READS_S) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rs); + cf.mips_s = static_cast(rs); + } + if (tflags & (TF_READS_T | TF_WRITES_T)) + { + if (tflags & TF_READS_T) + MarkRegsNeeded(HR_TYPE_CPU_REG, rt); + cf.mips_t = static_cast(rt); + } + + UpdateHostRegCounters(); + + // constant address? + std::optional addr; + if (HasConstantReg(rs)) + { + addr = GetConstantRegU32(rs) + inst->i.imm_sext32(); + cf.const_s = true; + } + else + { + if constexpr (HAS_MEMORY_OPERANDS) + { + // don't bother caching it since we're going to flush anyway + // TODO: make less rubbish, if it's caller saved we don't need to flush... 
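+      // Sketch of the two paths: with HAS_MEMORY_OPERANDS (x64) the backend can read rs
+      // directly out of g_state, so a host register is only reused if rs already has one;
+      // without memory operands (AArch64/RISC-V) rs is always pulled into a host register.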
+ const std::optional hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + if (hreg.has_value()) + { + cf.valid_host_s = true; + cf.host_s = hreg.value(); + } + } + else + { + // need rs in a register + cf.host_s = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + cf.valid_host_s = true; + } + } + + // reads T -> store, writes T -> load + // for now, we defer the allocation to afterwards, because C call + if (tflags & TF_READS_T) + { + if (HasConstantReg(rt)) + { + cf.const_t = true; + } + else + { + if constexpr (HAS_MEMORY_OPERANDS) + { + const std::optional hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + if (hreg.has_value()) + { + cf.valid_host_t = true; + cf.host_t = hreg.value(); + } + } + else + { + cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + cf.valid_host_t = true; + } + } + } + + (this->*func)(cf, size, sign, addr); +} + +void CPU::NewRec::Compiler::FlushForLoadStore(const std::optional& address, bool store) +{ + if (CodeCache::IsUsingFastmem() && !g_settings.cpu_recompiler_memory_exceptions) + return; + + // TODO: Stores don't need to flush GTE cycles... + Flush(FLUSH_FOR_C_CALL | FLUSH_FOR_LOADSTORE); +} + +void CPU::NewRec::Compiler::CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move) +{ + if (dst == src || dst == Reg::zero) + return; + + if (HasConstantReg(src)) + { + DeleteMIPSReg(dst, false); + SetConstantReg(dst, GetConstantRegU32(src)); + } + else + { + const u32 srcreg = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, src); + if (!TryRenameMIPSReg(dst, src, srcreg, Reg::count)) + { + const u32 dstreg = AllocateHostReg(HR_MODE_WRITE, HR_TYPE_CPU_REG, dst); + CopyHostReg(dstreg, srcreg); + ClearHostRegNeeded(dstreg); + } + } + + // TODO: This could be made better if we only did it for registers where there was a previous MFC2. + if (g_settings.gpu_pgxp_enable && pgxp_move) + { + // might've been renamed, so use dst here + GeneratePGXPCallWithMIPSRegs(reinterpret_cast(&PGXP::CPU_MOVE), + (static_cast(dst) << 8) | (static_cast(src)), dst); + } +} + +void CPU::NewRec::Compiler::Compile_j() +{ + const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2); + + // TODO: Delay slot swap. + // We could also move the cycle commit back. 
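+  // As with every MIPS branch, the instruction in the delay slot still executes, e.g.
+  //   j     target
+  //   addiu $sp, $sp, -16   <- compiled by CompileBranchDelaySlot() before the block ends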
+ CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jr_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + const u32 newpc = GetConstantRegU32(cf.MipsS()); + if (newpc & 3 && g_settings.cpu_recompiler_memory_exceptions) + { + EndBlockWithException(Exception::AdEL); + return; + } + + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jal() +{ + const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2); + SetConstantReg(Reg::ra, GetBranchReturnAddress({})); + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jalr_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + const u32 newpc = GetConstantRegU32(cf.MipsS()); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress({})); + + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_syscall() +{ + EndBlockWithException(Exception::Syscall); +} + +void CPU::NewRec::Compiler::Compile_break() +{ + EndBlockWithException(Exception::BP); +} + +void CPU::NewRec::Compiler::Compile_b_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + + const u8 irt = static_cast(inst->i.rt.GetValue()); + const bool bgez = ConvertToBoolUnchecked(irt & u8(1)); + const bool link = (irt & u8(0x1E)) == u8(0x10); + + const s32 rs = GetConstantRegS32(cf.MipsS()); + const bool taken = bgez ? (rs >= 0) : (rs < 0); + const u32 taken_pc = GetConditionalBranchTarget(cf); + + if (link) + SetConstantReg(Reg::ra, GetBranchReturnAddress(cf)); + + CompileBranchDelaySlot(); + EndBlock(taken ? taken_pc : m_compiler_pc, true); +} + +void CPU::NewRec::Compiler::Compile_b(CompileFlags cf) +{ + const u8 irt = static_cast(inst->i.rt.GetValue()); + const bool bgez = ConvertToBoolUnchecked(irt & u8(1)); + const bool link = (irt & u8(0x1E)) == u8(0x10); + + if (link) + SetConstantReg(Reg::ra, GetBranchReturnAddress(cf)); + + Compile_bxx(cf, bgez ? 
BranchCondition::GreaterEqualZero : BranchCondition::LessThanZero); +} + +void CPU::NewRec::Compiler::Compile_blez(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::LessEqualZero); +} + +void CPU::NewRec::Compiler::Compile_blez_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::LessEqualZero); +} + +void CPU::NewRec::Compiler::Compile_bgtz(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::GreaterThanZero); +} + +void CPU::NewRec::Compiler::Compile_bgtz_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::GreaterThanZero); +} + +void CPU::NewRec::Compiler::Compile_beq(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::Equal); +} + +void CPU::NewRec::Compiler::Compile_beq_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::Equal); +} + +void CPU::NewRec::Compiler::Compile_bne(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::NotEqual); +} + +void CPU::NewRec::Compiler::Compile_bne_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::NotEqual); +} + +void CPU::NewRec::Compiler::Compile_bxx_const(CompileFlags cf, BranchCondition cond) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + bool taken; + switch (cond) + { + case BranchCondition::Equal: + taken = GetConstantRegU32(cf.MipsS()) == GetConstantRegU32(cf.MipsT()); + break; + + case BranchCondition::NotEqual: + taken = GetConstantRegU32(cf.MipsS()) != GetConstantRegU32(cf.MipsT()); + break; + + case BranchCondition::GreaterThanZero: + taken = GetConstantRegS32(cf.MipsS()) > 0; + break; + + case BranchCondition::GreaterEqualZero: + taken = GetConstantRegS32(cf.MipsS()) >= 0; + break; + + case BranchCondition::LessThanZero: + taken = GetConstantRegS32(cf.MipsS()) < 0; + break; + + case BranchCondition::LessEqualZero: + taken = GetConstantRegS32(cf.MipsS()) <= 0; + break; + + default: + Panic("Unhandled condition"); + return; + } + + const u32 taken_pc = GetConditionalBranchTarget(cf); + CompileBranchDelaySlot(); + EndBlock(taken ? 
taken_pc : m_compiler_pc, true); +} + +void CPU::NewRec::Compiler::Compile_sll_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << inst->r.shamt); +} + +void CPU::NewRec::Compiler::Compile_srl_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> inst->r.shamt); +} + +void CPU::NewRec::Compiler::Compile_sra_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), static_cast(GetConstantRegS32(cf.MipsT()) >> inst->r.shamt)); +} + +void CPU::NewRec::Compiler::Compile_sllv_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << (GetConstantRegU32(cf.MipsS()) & 0x1Fu)); +} + +void CPU::NewRec::Compiler::Compile_srlv_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu)); +} + +void CPU::NewRec::Compiler::Compile_srav_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), static_cast(GetConstantRegS32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu))); +} + +void CPU::NewRec::Compiler::Compile_and_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) & GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_or_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_xor_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) ^ GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_nor_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), ~(GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_slt_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < GetConstantRegS32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_sltu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegU32(cf.MipsS()) < GetConstantRegU32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_mult_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u64 res = + static_cast(static_cast(GetConstantRegS32(cf.MipsS())) * static_cast(GetConstantRegS32(cf.MipsT()))); + SetConstantReg(Reg::hi, static_cast(res >> 32)); + SetConstantReg(Reg::lo, static_cast(res)); +} + +void CPU::NewRec::Compiler::Compile_multu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u64 res = static_cast(GetConstantRegU32(cf.MipsS())) * static_cast(GetConstantRegU32(cf.MipsT())); + SetConstantReg(Reg::hi, static_cast(res >> 32)); + SetConstantReg(Reg::lo, static_cast(res)); +} + +void 
CPU::NewRec::Compiler::Compile_div_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const s32 num = GetConstantRegS32(cf.MipsS()); + const s32 denom = GetConstantRegS32(cf.MipsT()); + + s32 lo, hi; + if (denom == 0) + { + // divide by zero + lo = (num >= 0) ? UINT32_C(0xFFFFFFFF) : UINT32_C(1); + hi = static_cast(num); + } + else if (static_cast(num) == UINT32_C(0x80000000) && denom == -1) + { + // unrepresentable + lo = UINT32_C(0x80000000); + hi = 0; + } + else + { + lo = num / denom; + hi = num % denom; + } + + SetConstantReg(Reg::hi, hi); + SetConstantReg(Reg::lo, lo); +} + +void CPU::NewRec::Compiler::Compile_divu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u32 num = GetConstantRegU32(cf.MipsS()); + const u32 denom = GetConstantRegU32(cf.MipsT()); + + u32 lo, hi; + + if (denom == 0) + { + // divide by zero + lo = UINT32_C(0xFFFFFFFF); + hi = static_cast(num); + } + else + { + lo = num / denom; + hi = num % denom; + } + + SetConstantReg(Reg::hi, hi); + SetConstantReg(Reg::lo, lo); +} + +void CPU::NewRec::Compiler::Compile_add_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_addu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_sub_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_subu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_addi_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS())); + if (cf.MipsT() != Reg::zero) + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_addiu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_slti_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < static_cast(inst->i.imm_sext32()))); +} + +void CPU::NewRec::Compiler::Compile_sltiu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) < inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_andi_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) & inst->i.imm_zext32()); +} + +void CPU::NewRec::Compiler::Compile_ori_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) | inst->i.imm_zext32()); +} + +void CPU::NewRec::Compiler::Compile_xori_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + 
SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) ^ inst->i.imm_zext32());
+}
+
+void CPU::NewRec::Compiler::Compile_lui()
+{
+  if (inst->i.rt == Reg::zero)
+    return;
+
+  SetConstantReg(inst->i.rt, inst->i.imm_zext32() << 16);
+}
+
+static constexpr const std::array<std::pair<u32*, u32>, 16> s_cop0_table = {
+  {{nullptr, 0x00000000u},
+   {nullptr, 0x00000000u},
+   {nullptr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BPC, 0xffffffffu},
+   {nullptr, 0},
+   {&CPU::g_state.cop0_regs.BDA, 0xffffffffu},
+   {&CPU::g_state.cop0_regs.TAR, 0x00000000u},
+   {&CPU::g_state.cop0_regs.dcic.bits, CPU::Cop0Registers::DCIC::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.BadVaddr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BDAM, 0xffffffffu},
+   {nullptr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BPCM, 0xffffffffu},
+   {&CPU::g_state.cop0_regs.sr.bits, CPU::Cop0Registers::SR::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.cause.bits, CPU::Cop0Registers::CAUSE::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.EPC, 0x00000000u},
+   {&CPU::g_state.cop0_regs.PRID, 0x00000000u}}};
+
+u32* CPU::NewRec::Compiler::GetCop0RegPtr(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].first : nullptr;
+}
+
+u32 CPU::NewRec::Compiler::GetCop0RegWriteMask(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].second : 0;
+}
+
+void CPU::NewRec::Compiler::Compile_mfc0(CompileFlags cf)
+{
+  const Cop0Reg r = static_cast<Cop0Reg>(MipsD());
+  const u32* ptr = GetCop0RegPtr(r);
+  if (!ptr)
+  {
+    Log_ErrorPrintf("Read from unknown cop0 reg %u", static_cast<u32>(r));
+    Compile_Fallback();
+    return;
+  }
+
+  DebugAssert(cf.valid_host_t);
+  LoadHostRegFromCPUPointer(cf.host_t, ptr);
+}
+
+std::pair<u32*, CPU::NewRec::Compiler::GTERegisterAccessAction>
+CPU::NewRec::Compiler::GetGTERegisterPointer(u32 index, bool writing)
+{
+  if (!writing)
+  {
+    // Most GTE registers can be read directly. Handle the special cases here.
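+    // The returned (pointer, action) pair is only a description; the per-arch
+    // mfc2/mtc2/lwc2/swc2 implementations are expected to turn it into a plain 32-bit
+    // access, a 16-bit sign/zero extension, a FIFO push, or a call into the GTE handler.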
+ if (index == 15) // SXY3 + { + // mirror of SXY2 + index = 14; + } + + switch (index) + { + case 28: // IRGB + case 29: // ORGB + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler); + } + break; + + default: + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct); + } + break; + } + } + else + { + switch (index) + { + case 1: // V0[z] + case 3: // V1[z] + case 5: // V2[z] + case 8: // IR0 + case 9: // IR1 + case 10: // IR2 + case 11: // IR3 + case 36: // RT33 + case 44: // L33 + case 52: // LR33 + case 58: // H - sign-extended on read but zext on use + case 59: // DQA + case 61: // ZSF3 + case 62: // ZSF4 + { + // sign-extend z component of vector registers + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::SignExtend16); + } + break; + + case 7: // OTZ + case 16: // SZ0 + case 17: // SZ1 + case 18: // SZ2 + case 19: // SZ3 + { + // zero-extend unsigned values + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::ZeroExtend16); + } + break; + + case 15: // SXY3 + { + // writing to SXYP pushes to the FIFO + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::PushFIFO); + } + break; + + case 28: // IRGB + case 30: // LZCS + case 63: // FLAG + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler); + } + + case 29: // ORGB + case 31: // LZCR + { + // read-only registers + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Ignore); + } + + default: + { + // written as-is, 2x16 or 1x32 bits + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct); + } + } + } +} + +void CPU::NewRec::Compiler::AddGTETicks(TickCount ticks) +{ + // TODO: check, int has +1 here + m_gte_done_cycle = m_cycles + ticks; + Log_DebugPrintf("Adding %d GTE ticks", ticks); +} + +void CPU::NewRec::Compiler::StallUntilGTEComplete() +{ + // TODO: hack to match old rec.. this may or may not be correct behavior + // it's the difference between stalling before and after the current instruction's cycle + DebugAssert(m_cycles > 0); + m_cycles--; + + if (!m_dirty_gte_done_cycle) + { + // simple case - in block scheduling + if (m_gte_done_cycle > m_cycles) + { + Log_DebugPrintf("Stalling for %d ticks from GTE", m_gte_done_cycle - m_cycles); + m_cycles += (m_gte_done_cycle - m_cycles); + } + } + else + { + // switch to in block scheduling + Log_DebugPrintf("Flushing GTE stall from state"); + Flush(FLUSH_GTE_STALL_FROM_STATE); + } + + m_cycles++; +} + +void CPU::NewRec::BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info) +{ + // remove the cycles we added for the memory read, then take them off again after the backpatch + // the normal rec path will add the ram read ticks later, so we need to take them off at the end + DebugAssert(!info.is_load || info.cycles >= Bus::RAM_READ_TICKS); + const TickCount cycles_to_add = + static_cast(static_cast(info.cycles)) - (info.is_load ? 
Bus::RAM_READ_TICKS : 0); + const TickCount cycles_to_remove = static_cast(static_cast(info.cycles)); + + JitCodeBuffer& buffer = CodeCache::GetCodeBuffer(); + void* thunk_address = buffer.GetFreeFarCodePointer(); + const u32 thunk_size = CompileLoadStoreThunk( + thunk_address, buffer.GetFreeFarCodeSpace(), exception_pc, info.code_size, cycles_to_add, cycles_to_remove, + info.gpr_bitmask, info.address_register, info.data_register, info.AccessSize(), info.is_signed, info.is_load); + +#if 0 + Log_DebugPrintf("**Backpatch Thunk**"); + CodeCache::DisassembleAndLogHostCode(thunk_address, thunk_size); +#endif + + // backpatch to a jump to the slowmem handler + CodeCache::EmitJump(exception_pc, thunk_address, true); + + buffer.CommitFarCode(thunk_size); +} diff --git a/src/core/cpu_newrec_compiler.h b/src/core/cpu_newrec_compiler.h new file mode 100644 index 000000000..7781006cf --- /dev/null +++ b/src/core/cpu_newrec_compiler.h @@ -0,0 +1,465 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_code_cache_private.h" +#include "cpu_recompiler_types.h" +#include "cpu_types.h" +#include +#include +#include +#include +#include + +namespace CPU::NewRec { + +// Global options +static constexpr bool EMULATE_LOAD_DELAYS = true; +static constexpr bool SWAP_BRANCH_DELAY_SLOTS = true; + +// Arch-specific options +#if defined(CPU_ARCH_X64) +static constexpr u32 NUM_HOST_REGS = 16; +static constexpr bool HAS_MEMORY_OPERANDS = true; +#elif defined(CPU_ARCH_ARM64) +static constexpr u32 NUM_HOST_REGS = 32; +static constexpr bool HAS_MEMORY_OPERANDS = false; +#elif defined(CPU_ARCH_RISCV64) +static constexpr u32 NUM_HOST_REGS = 32; +static constexpr bool HAS_MEMORY_OPERANDS = false; +#endif + +// TODO: Get rid of the virtuals... somehow. +class Compiler +{ +public: + Compiler(); + virtual ~Compiler(); + + const void* CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size); + +protected: + enum FlushFlags : u32 + { + FLUSH_FLUSH_MIPS_REGISTERS = (1 << 0), + FLUSH_INVALIDATE_MIPS_REGISTERS = (1 << 1), + FLUSH_FREE_CALLER_SAVED_REGISTERS = (1 << 2), + FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS = (1 << 3), + FLUSH_FREE_ALL_REGISTERS = (1 << 4), + FLUSH_PC = (1 << 5), + FLUSH_INSTRUCTION_BITS = (1 << 6), + FLUSH_CYCLES = (1 << 7), + FLUSH_LOAD_DELAY = (1 << 8), + FLUSH_LOAD_DELAY_FROM_STATE = (1 << 9), + FLUSH_GTE_DONE_CYCLE = (1 << 10), + FLUSH_GTE_STALL_FROM_STATE = (1 << 11), + + FLUSH_FOR_C_CALL = (FLUSH_FREE_CALLER_SAVED_REGISTERS), + FLUSH_FOR_LOADSTORE = (FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_CYCLES), + FLUSH_FOR_BRANCH = (FLUSH_FLUSH_MIPS_REGISTERS), + FLUSH_FOR_EXCEPTION = + (FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE), // GTE cycles needed because it stalls when a GTE instruction is next. 
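+    // Call sites combine these as needed, e.g. FlushForLoadStore() issues
+    // Flush(FLUSH_FOR_C_CALL | FLUSH_FOR_LOADSTORE) when an access might fall back to a
+    // C call.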
+ FLUSH_FOR_INTERPRETER = + (FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_INVALIDATE_MIPS_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_PC | + FLUSH_CYCLES | FLUSH_INSTRUCTION_BITS | FLUSH_LOAD_DELAY | FLUSH_GTE_DONE_CYCLE), + FLUSH_END_BLOCK = 0xFFFFFFFFu & ~(FLUSH_PC | FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE | FLUSH_INSTRUCTION_BITS | + FLUSH_GTE_STALL_FROM_STATE), + }; + + union CompileFlags + { + struct + { + u32 const_s : 1; // S is constant + u32 const_t : 1; // T is constant + u32 const_lo : 1; // LO is constant + u32 const_hi : 1; // HI is constant + + u32 valid_host_d : 1; // D is valid in host register + u32 valid_host_s : 1; // S is valid in host register + u32 valid_host_t : 1; // T is valid in host register + u32 valid_host_lo : 1; // LO is valid in host register + u32 valid_host_hi : 1; // HI is valid in host register + + u32 host_d : 5; // D host register + u32 host_s : 5; // S host register + u32 host_t : 5; // T host register + u32 host_lo : 5; // LO host register + + u32 delay_slot_swapped : 1; + u32 pad1 : 2; // 28..31 + + u32 host_hi : 5; // HI host register + + u32 mips_s : 5; // S guest register + u32 mips_t : 5; // T guest register + + u32 pad2 : 15; // 32 bits + }; + + u64 bits; + + ALWAYS_INLINE Reg MipsS() const { return static_cast(mips_s); } + ALWAYS_INLINE Reg MipsT() const { return static_cast(mips_t); } + }; + static_assert(sizeof(CompileFlags) == sizeof(u64)); + + enum TemplateFlag : u32 + { + TF_READS_S = (1 << 0), + TF_READS_T = (1 << 1), + TF_READS_LO = (1 << 2), + TF_READS_HI = (1 << 3), + TF_WRITES_D = (1 << 4), + TF_WRITES_T = (1 << 5), + TF_WRITES_LO = (1 << 6), + TF_WRITES_HI = (1 << 7), + TF_COMMUTATIVE = (1 << 8), // S op T == T op S + TF_CAN_OVERFLOW = (1 << 9), + + // TF_NORENAME = // TODO + TF_LOAD_DELAY = (1 << 10), + TF_GTE_STALL = (1 << 11), + + TF_NO_NOP = (1 << 12), + TF_NEEDS_REG_S = (1 << 13), + TF_NEEDS_REG_T = (1 << 14), + TF_CAN_SWAP_DELAY_SLOT = (1 << 15), + + TF_RENAME_WITH_ZERO_T = (1 << 16), // add commutative for S as well + TF_RENAME_WITH_ZERO_IMM = (1 << 17), + + TF_PGXP_WITHOUT_CPU = (1 << 18), + }; + + enum HostRegFlags : u8 + { + HR_ALLOCATED = (1 << 0), + HR_NEEDED = (1 << 1), + HR_MODE_READ = (1 << 2), // valid + HR_MODE_WRITE = (1 << 3), // dirty + + HR_USABLE = (1 << 7), + HR_CALLEE_SAVED = (1 << 6), + + ALLOWED_HR_FLAGS = HR_MODE_READ | HR_MODE_WRITE, + IMMUTABLE_HR_FLAGS = HR_USABLE | HR_CALLEE_SAVED, + }; + + enum HostRegAllocType : u8 + { + HR_TYPE_TEMP, + HR_TYPE_CPU_REG, + HR_TYPE_PC_WRITEBACK, + HR_TYPE_LOAD_DELAY_VALUE, + HR_TYPE_NEXT_LOAD_DELAY_VALUE, + }; + + struct HostRegAlloc + { + u8 flags; + HostRegAllocType type; + Reg reg; + u16 counter; + }; + + enum class BranchCondition : u8 + { + Equal, + NotEqual, + GreaterThanZero, + GreaterEqualZero, + LessThanZero, + LessEqualZero, + }; + + ALWAYS_INLINE bool HasConstantReg(Reg r) const { return m_constant_regs_valid.test(static_cast(r)); } + ALWAYS_INLINE bool HasDirtyConstantReg(Reg r) const { return m_constant_regs_dirty.test(static_cast(r)); } + ALWAYS_INLINE bool HasConstantRegValue(Reg r, u32 val) const + { + return m_constant_regs_valid.test(static_cast(r)) && m_constant_reg_values[static_cast(r)] == val; + } + ALWAYS_INLINE u32 GetConstantRegU32(Reg r) const { return m_constant_reg_values[static_cast(r)]; } + ALWAYS_INLINE s32 GetConstantRegS32(Reg r) const + { + return static_cast(m_constant_reg_values[static_cast(r)]); + } + void SetConstantReg(Reg r, u32 v); + void ClearConstantReg(Reg r); + void FlushConstantReg(Reg r); + void FlushConstantRegs(bool 
invalidate); + + Reg MipsD() const; + u32 GetConditionalBranchTarget(CompileFlags cf) const; + u32 GetBranchReturnAddress(CompileFlags cf) const; + bool TrySwapDelaySlot(Reg rs = Reg::zero, Reg rt = Reg::zero, Reg rd = Reg::zero); + void SetCompilerPC(u32 newpc); + + virtual const void* GetCurrentCodePointer() = 0; + + virtual void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space); + virtual void BeginBlock(); + virtual void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) = 0; + virtual void GenerateICacheCheckAndUpdate() = 0; + virtual void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) = 0; + virtual void EndBlock(const std::optional& newpc, bool do_event_test) = 0; + virtual void EndBlockWithException(Exception excode) = 0; + virtual const void* EndCompile(u32* code_size, u32* far_code_size) = 0; + + ALWAYS_INLINE bool IsHostRegAllocated(u32 r) const { return (m_host_regs[r].flags & HR_ALLOCATED) != 0; } + static const char* GetReadWriteModeString(u32 flags); + virtual const char* GetHostRegName(u32 reg) const = 0; + u32 GetFreeHostReg(u32 flags); + u32 AllocateHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count); + std::optional CheckHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count); + u32 AllocateTempHostReg(u32 flags = 0); + void SwapHostRegAlloc(u32 lhs, u32 rhs); + void FlushHostReg(u32 reg); + void FreeHostReg(u32 reg); + void ClearHostReg(u32 reg); + void MarkRegsNeeded(HostRegAllocType type, Reg reg); + void RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg); + void ClearHostRegNeeded(u32 reg); + void ClearHostRegsNeeded(); + void DeleteMIPSReg(Reg reg, bool flush); + bool TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other); + void UpdateHostRegCounters(); + + virtual void LoadHostRegWithConstant(u32 reg, u32 val) = 0; + virtual void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) = 0; + virtual void StoreConstantToCPUPointer(u32 val, const void* ptr) = 0; + virtual void StoreHostRegToCPUPointer(u32 reg, const void* ptr) = 0; + virtual void CopyHostReg(u32 dst, u32 src) = 0; + virtual void Flush(u32 flags); + + /// Returns true if there is a load delay which will be stored at the end of the instruction. + bool HasLoadDelay() const { return m_load_delay_register != Reg::count; } + + /// Cancels any pending load delay to the specified register. + void CancelLoadDelaysToReg(Reg reg); + + /// Moves load delay to the next load delay, and writes any previous load delay to the destination register. + void UpdateLoadDelay(); + + /// Flushes the load delay, i.e. writes it to the destination register. + void FinishLoadDelay(); + + /// Flushes the load delay, but only if it matches the specified register. + void FinishLoadDelayToReg(Reg reg); + + /// Uses a caller-saved register for load delays when PGXP is enabled. + u32 GetFlagsForNewLoadDelayedReg() const; + + void BackupHostState(); + void RestoreHostState(); + + /// Registers loadstore for possible backpatching. 
+ void AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register, + MemoryAccessSize size, bool is_signed, bool is_load); + + void CompileInstruction(); + void CompileBranchDelaySlot(bool dirty_pc = true); + + void CompileTemplate(void (Compiler::*const_func)(CompileFlags), void (Compiler::*func)(CompileFlags), + const void* pgxp_cpu_func, u32 tflags); + void CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool, + const std::optional&), + MemoryAccessSize size, bool store, bool sign, u32 tflags); + void FlushForLoadStore(const std::optional& address, bool store); + void CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move); + + virtual void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) = 0; + + virtual void Compile_Fallback() = 0; + + void Compile_j(); + virtual void Compile_jr(CompileFlags cf) = 0; + void Compile_jr_const(CompileFlags cf); + void Compile_jal(); + virtual void Compile_jalr(CompileFlags cf) = 0; + void Compile_jalr_const(CompileFlags cf); + void Compile_syscall(); + void Compile_break(); + + void Compile_b_const(CompileFlags cf); + void Compile_b(CompileFlags cf); + void Compile_blez(CompileFlags cf); + void Compile_blez_const(CompileFlags cf); + void Compile_bgtz(CompileFlags cf); + void Compile_bgtz_const(CompileFlags cf); + void Compile_beq(CompileFlags cf); + void Compile_beq_const(CompileFlags cf); + void Compile_bne(CompileFlags cf); + void Compile_bne_const(CompileFlags cf); + virtual void Compile_bxx(CompileFlags cf, BranchCondition cond) = 0; + void Compile_bxx_const(CompileFlags cf, BranchCondition cond); + + void Compile_sll_const(CompileFlags cf); + virtual void Compile_sll(CompileFlags cf) = 0; + void Compile_srl_const(CompileFlags cf); + virtual void Compile_srl(CompileFlags cf) = 0; + void Compile_sra_const(CompileFlags cf); + virtual void Compile_sra(CompileFlags cf) = 0; + void Compile_sllv_const(CompileFlags cf); + virtual void Compile_sllv(CompileFlags cf) = 0; + void Compile_srlv_const(CompileFlags cf); + virtual void Compile_srlv(CompileFlags cf) = 0; + void Compile_srav_const(CompileFlags cf); + virtual void Compile_srav(CompileFlags cf) = 0; + void Compile_mult_const(CompileFlags cf); + virtual void Compile_mult(CompileFlags cf) = 0; + void Compile_multu_const(CompileFlags cf); + virtual void Compile_multu(CompileFlags cf) = 0; + void Compile_div_const(CompileFlags cf); + virtual void Compile_div(CompileFlags cf) = 0; + void Compile_divu_const(CompileFlags cf); + virtual void Compile_divu(CompileFlags cf) = 0; + void Compile_add_const(CompileFlags cf); + virtual void Compile_add(CompileFlags cf) = 0; + void Compile_addu_const(CompileFlags cf); + virtual void Compile_addu(CompileFlags cf) = 0; + void Compile_sub_const(CompileFlags cf); + virtual void Compile_sub(CompileFlags cf) = 0; + void Compile_subu_const(CompileFlags cf); + virtual void Compile_subu(CompileFlags cf) = 0; + void Compile_and_const(CompileFlags cf); + virtual void Compile_and(CompileFlags cf) = 0; + void Compile_or_const(CompileFlags cf); + virtual void Compile_or(CompileFlags cf) = 0; + void Compile_xor_const(CompileFlags cf); + virtual void Compile_xor(CompileFlags cf) = 0; + void Compile_nor_const(CompileFlags cf); + virtual void Compile_nor(CompileFlags cf) = 0; + void Compile_slt_const(CompileFlags cf); + virtual void Compile_slt(CompileFlags cf) = 0; + void Compile_sltu_const(CompileFlags cf); + virtual void Compile_sltu(CompileFlags cf) = 0; 
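+
+  // Each ALU op comes as a pair: the _const form is evaluated at compile time when all of
+  // its inputs are known constants (the allow_constant path in CompileTemplate()), while
+  // the pure-virtual form is the per-backend code generator for the general case.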
+ + void Compile_addi_const(CompileFlags cf); + virtual void Compile_addi(CompileFlags cf) = 0; + void Compile_addiu_const(CompileFlags cf); + virtual void Compile_addiu(CompileFlags cf) = 0; + void Compile_slti_const(CompileFlags cf); + virtual void Compile_slti(CompileFlags cf) = 0; + void Compile_sltiu_const(CompileFlags cf); + virtual void Compile_sltiu(CompileFlags cf) = 0; + void Compile_andi_const(CompileFlags cf); + virtual void Compile_andi(CompileFlags cf) = 0; + void Compile_ori_const(CompileFlags cf); + virtual void Compile_ori(CompileFlags cf) = 0; + void Compile_xori_const(CompileFlags cf); + virtual void Compile_xori(CompileFlags cf) = 0; + void Compile_lui(); + + virtual void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; // lwl/lwr + virtual void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; // swl/swr + virtual void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + + static u32* GetCop0RegPtr(Cop0Reg reg); + static u32 GetCop0RegWriteMask(Cop0Reg reg); + + void Compile_mfc0(CompileFlags cf); + virtual void Compile_mtc0(CompileFlags cf) = 0; + virtual void Compile_rfe(CompileFlags cf) = 0; + + void AddGTETicks(TickCount ticks); + void StallUntilGTEComplete(); + virtual void Compile_mfc2(CompileFlags cf) = 0; + virtual void Compile_mtc2(CompileFlags cf) = 0; + virtual void Compile_cop2(CompileFlags cf) = 0; + + enum GTERegisterAccessAction : u8 + { + Ignore, + Direct, + ZeroExtend16, + SignExtend16, + CallHandler, + PushFIFO, + }; + + static std::pair GetGTERegisterPointer(u32 index, bool writing); + + CodeCache::Block* m_block = nullptr; + u32 m_compiler_pc = 0; + TickCount m_cycles = 0; + TickCount m_gte_done_cycle = 0; + + const Instruction* inst = nullptr; + const CodeCache::InstructionInfo* iinfo = nullptr; + u32 m_current_instruction_pc = 0; + bool m_current_instruction_branch_delay_slot = false; + bool m_branch_delay_slot_swapped = false; + + bool m_dirty_pc = false; + bool m_dirty_instruction_bits = false; + bool m_dirty_gte_done_cycle = false; + bool m_block_ended = false; + + std::bitset(Reg::count)> m_constant_regs_valid = {}; + std::bitset(Reg::count)> m_constant_regs_dirty = {}; + std::array(Reg::count)> m_constant_reg_values = {}; + + std::array m_host_regs = {}; + u16 m_register_alloc_counter = 0; + + bool m_load_delay_dirty = true; + Reg m_load_delay_register = Reg::count; + u32 m_load_delay_value_register = 0; + + Reg m_next_load_delay_register = Reg::count; + u32 m_next_load_delay_value_register = 0; + + struct HostStateBackup + { + TickCount cycles; + TickCount gte_done_cycle; + u32 compiler_pc; + bool dirty_pc; + bool dirty_instruction_bits; + bool dirty_gte_done_cycle; + bool block_ended; + const Instruction* inst; + const CodeCache::InstructionInfo* iinfo; + u32 current_instruction_pc; + bool current_instruction_delay_slot; + std::bitset(Reg::count)> const_regs_valid; + std::bitset(Reg::count)> const_regs_dirty; + std::array(Reg::count)> const_regs_values; + std::array host_regs; + u16 register_alloc_counter; + bool load_delay_dirty; + Reg 
load_delay_register; + u32 load_delay_value_register; + Reg next_load_delay_register; + u32 next_load_delay_value_register; + }; + + // we need two of these, one for branch delays, and another if we have an overflow in the delay slot + std::array m_host_state_backup = {}; + u32 m_host_state_backup_count = 0; + + // PGXP memory callbacks + static const std::array, 3> s_pgxp_mem_load_functions; + static const std::array s_pgxp_mem_store_functions; +}; + +void BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info); + +u32 CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, TickCount cycles_to_add, + TickCount cycles_to_remove, u32 gpr_bitmask, u8 address_register, u8 data_register, + MemoryAccessSize size, bool is_signed, bool is_load); + +extern Compiler* g_compiler; +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp new file mode 100644 index 000000000..4d05927bd --- /dev/null +++ b/src/core/cpu_newrec_compiler_aarch64.cpp @@ -0,0 +1,2235 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_aarch64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "gte.h" +#include "pgxp.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#define DUMP_BLOCKS + +#ifdef DUMP_BLOCKS +#include "vixl/aarch64/disasm-aarch64.h" +#endif + +using namespace vixl::aarch64; + +#define RWRET vixl::aarch64::w0 +#define RXRET vixl::aarch64::x0 +#define RWARG1 vixl::aarch64::w0 +#define RXARG1 vixl::aarch64::x0 +#define RWARG2 vixl::aarch64::w1 +#define RXARG2 vixl::aarch64::x1 +#define RWARG3 vixl::aarch64::w2 +#define RXARG3 vixl::aarch64::x2 +#define RWSCRATCH vixl::aarch64::w16 +#define RXSCRATCH vixl::aarch64::x16 +#define RSTATE vixl::aarch64::x19 +#define RMEMBASE vixl::aarch64::x20 + +#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (u32)(((u8*)(x)) - ((u8*)&g_state))) + +namespace CPU::NewRec { + +using CPU::Recompiler::armEmitCall; +using CPU::Recompiler::armEmitCondBranch; +using CPU::Recompiler::armEmitJmp; +using CPU::Recompiler::armEmitMov; +using CPU::Recompiler::armGetJumpTrampoline; +using CPU::Recompiler::armGetPCDisplacement; +using CPU::Recompiler::armIsCallerSavedRegister; +using CPU::Recompiler::armMoveAddressToReg; + +AArch64Compiler s_instance; +Compiler* g_compiler = &s_instance; + +} // namespace CPU::NewRec + +CPU::NewRec::AArch64Compiler::AArch64Compiler() = default; + +CPU::NewRec::AArch64Compiler::~AArch64Compiler() = default; + +const void* CPU::NewRec::AArch64Compiler::GetCurrentCodePointer() +{ + return armAsm->GetCursorAddress(); +} + +void CPU::NewRec::AArch64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. 
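+  // Two vixl assemblers are kept per block: "near" code for the hot path and "far" code
+  // for cold paths such as the exception exits emitted through SwitchToFarCode() (see
+  // CheckBranchTarget() below).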
+ DebugAssert(!m_emitter && !m_far_emitter && !armAsm); + m_emitter = std::make_unique(code_buffer, code_buffer_space, PositionDependentCode); + m_far_emitter = std::make_unique(far_code_buffer, far_code_space, PositionDependentCode); + armAsm = m_emitter.get(); + +#ifdef VIXL_DEBUG + m_emitter_check = std::make_unique(m_emitter.get(), code_buffer_space, + vixl::CodeBufferCheckScope::kDontReserveBufferSpace); + m_far_emitter_check = std::make_unique( + m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + + if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() || + i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30) + { + continue; + } + + ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond) +{ + DebugAssert(armAsm == m_emitter.get()); + if (emit_jump) + { + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (cond != Condition::al) + { + if (vixl::IsInt19(disp)) + { + armAsm->b(disp, cond); + } + else + { + Label skip; + armAsm->b(&skip, vixl::aarch64::InvertCondition(cond)); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + } + else + { + armAsm->b(disp); + } + } + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit) +{ + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (vixl::IsInt14(disp)) + { + armAsm->tbnz(reg, bit, disp); + } + else + { + Label skip; + armAsm->tbz(reg, bit, &skip); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero) +{ + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (vixl::IsInt19(disp)) + { + nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp); + } + else + { + Label skip; + nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond) +{ + DebugAssert(armAsm == m_far_emitter.get()); + if (emit_jump) + { + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter->GetCursorAddress()); + (cond != Condition::al) ? 
armAsm->b(disp, cond) : armAsm->b(disp); + } + armAsm = m_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::EmitMov(const vixl::aarch64::WRegister& dst, u32 val) +{ + armEmitMov(armAsm, dst, val); +} + +void CPU::NewRec::AArch64Compiler::EmitCall(const void* ptr, bool force_inline /*= false*/) +{ + armEmitCall(armAsm, ptr, force_inline); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(s32 val) +{ + if (Assembler::IsImmAddSub(val)) + return vixl::aarch64::Operand(static_cast(val)); + + EmitMov(RWSCRATCH, static_cast(val)); + return vixl::aarch64::Operand(RWSCRATCH); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(u32 val) +{ + return armCheckAddSubConstant(static_cast(val)); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckCompareConstant(s32 val) +{ + if (Assembler::IsImmConditionalCompare(val)) + return vixl::aarch64::Operand(static_cast(val)); + + EmitMov(RWSCRATCH, static_cast(val)); + return vixl::aarch64::Operand(RWSCRATCH); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckLogicalConstant(u32 val) +{ + if (Assembler::IsImmLogical(val, 32)) + return vixl::aarch64::Operand(static_cast(static_cast(val))); + + EmitMov(RWSCRATCH, val); + return vixl::aarch64::Operand(RWSCRATCH); +} + +void CPU::NewRec::AArch64Compiler::BeginBlock() +{ + Compiler::BeginBlock(); +} + +void CPU::NewRec::AArch64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + armMoveAddressToReg(armAsm, RXARG1, ram_ptr); + armMoveAddressToReg(armAsm, RXARG2, shadow_ptr); + + bool first = true; + u32 offset = 0; + Label block_changed; + + while (size >= 16) + { + const VRegister vtmp = v2.V4S(); + const VRegister dst = first ? 
v0.V4S() : v1.V4S(); + armAsm->ldr(dst, MemOperand(RXARG1, offset)); + armAsm->ldr(vtmp, MemOperand(RXARG2, offset)); + armAsm->cmeq(dst, dst, vtmp); + if (!first) + armAsm->and_(dst.V16B(), dst.V16B(), vtmp.V16B()); + else + first = false; + + offset += 16; + size -= 16; + } + + if (!first) + { + // TODO: make sure this doesn't choke on ffffffff + armAsm->uminv(s0, v0.V4S()); + armAsm->fcmp(s0, 0.0); + armAsm->b(&block_changed, eq); + } + + while (size >= 8) + { + armAsm->ldr(RXARG3, MemOperand(RXARG1, offset)); + armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset)); + armAsm->cmp(RXARG3, RXSCRATCH); + armAsm->b(&block_changed, ne); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + armAsm->ldr(RWARG3, MemOperand(RXARG1, offset)); + armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset)); + armAsm->cmp(RWARG3, RWSCRATCH); + armAsm->b(&block_changed, ne); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + Label block_unchanged; + armAsm->b(&block_unchanged); + armAsm->bind(&block_changed); + armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false); + armAsm->bind(&block_unchanged); +} + +void CPU::NewRec::AArch64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + } + else + { + const auto& ticks_reg = RWARG1; + const auto& current_tag_reg = RWARG2; + const auto& existing_tag_reg = RWARG3; + + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks)); + armEmitMov(armAsm, current_tag_reg, current_pc); + + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = offsetof(State, icache_tags) + (line * sizeof(u32)); + + Label cache_hit; + armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset)); + armAsm->cmp(existing_tag_reg, current_tag_reg); + armAsm->b(&cache_hit, eq); + + armAsm->str(current_tag_reg, MemOperand(RSTATE, offset)); + armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast(fill_ticks))); + armAsm->bind(&cache_hit); + + if (i != (m_block->icache_line_count - 1)) + armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE)); + } + + armAsm->str(ticks_reg, PTR(&g_state.pending_ticks)); + } +} + +void CPU::NewRec::AArch64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RXARG1.GetCode())) + armAsm->mov(RXARG1, XRegister(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RXARG2.GetCode())) + armAsm->mov(RXARG2, XRegister(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RXARG3.GetCode())) + armAsm->mov(RXARG3, XRegister(arg3reg)); + EmitCall(func); +} + +void CPU::NewRec::AArch64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + { + EmitMov(RWSCRATCH, newpc.value()); + armAsm->str(RWSCRATCH, PTR(&g_state.pc)); + } + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void 
CPU::NewRec::AArch64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + // TODO: move the cycle flush in here.. + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || m_gte_done_cycle > cycles || cycles > 0) + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + armAsm->ldr(RWARG2, PTR(&g_state.downcount)); + if (cycles > 0) + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles)); + if (m_gte_done_cycle > cycles) + { + armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles)); + armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick)); + } + if (do_event_test) + armAsm->cmp(RWARG1, RWARG2); + if (cycles > 0) + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch); + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + armEmitJmp(armAsm, CodeCache::g_dispatcher, false); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. + Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + armEmitJmp(armAsm, armAsm->GetBuffer()->GetStartAddress(), true); + } + else + { + const void* target = CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress(), newpc.value()); + armEmitJmp(armAsm, target, true); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::AArch64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ +#ifdef VIXL_DEBUG + m_emitter_check.reset(); + m_far_emitter_check.reset(); +#endif + + m_emitter->FinalizeCode(); + m_far_emitter->FinalizeCode(); + + u8* const code = m_emitter->GetBuffer()->GetStartAddress(); + *code_size = static_cast(m_emitter->GetCursorOffset()); + *far_code_size = static_cast(m_far_emitter->GetCursorOffset()); + armAsm = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const char* CPU::NewRec::AArch64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}}; + return (reg < reg64_names.size()) ? 
reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::AArch64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + EmitMov(WRegister(reg), val); +} + +void CPU::NewRec::AArch64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + armAsm->ldr(WRegister(reg), PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + armAsm->str(WRegister(reg), PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + if (val == 0) + { + armAsm->str(wzr, PTR(ptr)); + return; + } + + EmitMov(RWSCRATCH, val); + armAsm->str(RWSCRATCH, PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + armAsm->mov(WRegister(dst), WRegister(src)); +} + +void CPU::NewRec::AArch64Compiler::AssertRegOrConstS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s || cf.const_s); +} + +void CPU::NewRec::AArch64Compiler::AssertRegOrConstT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t || cf.const_t); +} + +vixl::aarch64::MemOperand CPU::NewRec::AArch64Compiler::MipsPtr(Reg r) const +{ + DebugAssert(r < Reg::count); + return PTR(&g_state.regs.r[static_cast(r)]); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return WRegister(cf.host_d); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return WRegister(cf.host_s); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return WRegister(cf.host_t); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return WRegister(cf.host_lo); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return WRegister(cf.host_hi); +} + +void CPU::NewRec::AArch64Compiler::MoveSToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != dst.GetCode()) + armAsm->mov(dst, WRegister(cf.host_s)); + } + else if (cf.const_s) + { + const u32 cv = GetConstantRegU32(cf.MipsS()); + if (cv == 0) + armAsm->mov(dst, wzr); + else + EmitMov(dst, cv); + } + else + { + Log_WarningPrintf("Hit memory path in MoveSToReg() for %s", GetRegName(cf.MipsS())); + armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s])); + } +} + +void CPU::NewRec::AArch64Compiler::MoveTToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != dst.GetCode()) + armAsm->mov(dst, WRegister(cf.host_t)); + } + else if (cf.const_t) + { + const u32 cv = GetConstantRegU32(cf.MipsT()); + if (cv == 0) + armAsm->mov(dst, wzr); + else + EmitMov(dst, cv); + } + else + { + Log_WarningPrintf("Hit memory path in MoveTToReg() for %s", GetRegName(cf.MipsT())); + armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t])); + } +} + +void CPU::NewRec::AArch64Compiler::MoveMIPSRegToReg(const vixl::aarch64::WRegister& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + armAsm->mov(dst, WRegister(hreg.value())); + else if (HasConstantReg(reg)) + EmitMov(dst, GetConstantRegU32(reg)); + else + armAsm->ldr(dst, MipsPtr(reg)); +} + +void CPU::NewRec::AArch64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = 
Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RWARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RWARG3, arg3reg); + + EmitMov(RWARG1, arg1val); + EmitCall(func); +} + +void CPU::NewRec::AArch64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + // This sucks, but it's only used for fallbacks. + Panic("Not implemented"); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg)); + armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value)); + EmitMov(RWSCRATCH, offsetof(CPU::State, regs.r[0])); + armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2)); + armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1)); + EmitMov(RWSCRATCH, static_cast(Reg::count)); + armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + EmitMov(RWSCRATCH, static_cast(m_load_delay_register)); + armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick)); + if (m_cycles > 0) + { + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + m_cycles = 0; + } + armAsm->cmp(RWARG2, RWARG1); + armAsm->csel(RWARG1, RWARG2, RWARG1, hs); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle)); + armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick)); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::AArch64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + +#if 0 + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. + // but nothing should be going through here.. 
+ Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +#else + Panic("Fixme"); +#endif +} + +void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::WRegister& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + armAsm->tst(pcreg, armCheckLogicalConstant(0x3)); + SwitchToFarCode(true, ne); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::AArch64Compiler::Compile_jr(CompileFlags cf) +{ + const WRegister pcreg = CFGetRegS(cf); + CheckBranchTarget(pcreg); + + armAsm->str(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_jalr(CompileFlags cf) +{ + const WRegister pcreg = CFGetRegS(cf); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + armAsm->str(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + AssertRegOrConstS(cf); + + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. + DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + Label taken; + const WRegister rs = CFGetRegS(cf); + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + AssertRegOrConstT(cf); + if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0)) + { + (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken); + } + else + { + if (cf.valid_host_t) + armAsm->cmp(rs, CFGetRegT(cf)); + else if (cf.const_t) + armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT()))); + + armAsm->b(&taken, (cond == BranchCondition::Equal) ? 
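    // Both outcomes of the compare are materialized as separate block exits below: the
    // fall-through path compiles the delay slot and ends the block at m_compiler_pc, then the
    // "taken" label is bound and the delay slot is compiled a second time (unless it was swapped
    // ahead of the branch) before ending at taken_pc. BackupHostState()/RestoreHostState()
    // snapshot the register-allocation and constant state so both copies start from the same
    // point. In outline:
    //
    //   b.<cond> taken
    //   [delay slot]; EndBlock(m_compiler_pc)   // not taken
    // taken:
    //   [delay slot]; EndBlock(taken_pc)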
eq : ne); + } + } + break; + + case BranchCondition::GreaterThanZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, gt); + } + break; + + case BranchCondition::GreaterEqualZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, ge); + } + break; + + case BranchCondition::LessThanZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, lt); + } + break; + + case BranchCondition::LessEqualZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, le); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + armAsm->bind(&taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf, bool overflow) +{ + const WRegister rs = CFGetRegS(cf); + const WRegister rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + if (!overflow) + { + armAsm->add(rt, rs, armCheckAddSubConstant(imm)); + } + else + { + armAsm->adds(rt, rs, armCheckAddSubConstant(imm)); + TestOverflow(rt); + } + } + else if (rt.GetCode() != rs.GetCode()) + { + armAsm->mov(rt, rs); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf) +{ + Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::AArch64Compiler::Compile_addiu(CompileFlags cf) +{ + Compile_addi(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast(inst->i.imm_sext32()))); + armAsm->cset(CFGetRegT(cf), sign ? 
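  // SLTI/SLTIU become a compare against the sign-extended immediate followed by a conditional
  // set; AArch64 "lt" yields the signed result and "lo" (unsigned lower) the unsigned one.
  // Equivalent C, as a sketch:
  //
  //   rt = sign ? ((s32)rs < (s32)imm_sext) : (rs < (u32)imm_sext);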
lt : lo); +} + +void CPU::NewRec::AArch64Compiler::Compile_andi(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm)); + else + armAsm->mov(rt, wzr); +} + +void CPU::NewRec::AArch64Compiler::Compile_ori(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + const WRegister rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->orr(rt, rs, armCheckLogicalConstant(imm)); + else if (rt.GetCode() != rs.GetCode()) + armAsm->mov(rt, rs); +} + +void CPU::NewRec::AArch64Compiler::Compile_xori(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + const WRegister rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->eor(rt, rs, armCheckLogicalConstant(imm)); + else if (rt.GetCode() != rs.GetCode()) + armAsm->mov(rt, rs); +} + +void CPU::NewRec::AArch64Compiler::Compile_shift(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + unsigned)) +{ + const WRegister rd = CFGetRegD(cf); + const WRegister rt = CFGetRegT(cf); + if (inst->r.shamt > 0) + (armAsm->*op)(rd, rt, inst->r.shamt); + else if (rd.GetCode() != rt.GetCode()) + armAsm->mov(rd, rt); +} + +void CPU::NewRec::AArch64Compiler::Compile_sll(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::lsl); +} + +void CPU::NewRec::AArch64Compiler::Compile_srl(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::lsr); +} + +void CPU::NewRec::AArch64Compiler::Compile_sra(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::asr); +} + +void CPU::NewRec::AArch64Compiler::Compile_variable_shift( + CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, + const vixl::aarch64::Register&), + void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned)) +{ + const WRegister rd = CFGetRegD(cf); + + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + if (cf.const_s) + { + if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0) + (armAsm->*op_const)(rd, rt, shift); + else if (rd.GetCode() != rt.GetCode()) + armAsm->mov(rd, rt); + } + else + { + (armAsm->*op)(rd, rt, CFGetRegS(cf)); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl); +} + +void CPU::NewRec::AArch64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr); +} + +void CPU::NewRec::AArch64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr); +} + +void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + // TODO: if lo/hi gets killed, we can use a 32-bit multiply + const WRegister lo = CFGetRegLO(cf); + const WRegister hi = CFGetRegHI(cf); + + (sign) ? 
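  // MULT/MULTU are emitted below as a single widening multiply into the 64-bit view of the LO
  // host register, with HI recovered by a 64-bit shift. Roughly:
  //
  //   u64 wide = sign ? (u64)((s64)(s32)rs * (s32)rt) : ((u64)rs * (u64)rt);
  //   LO = (u32)wide;
  //   HI = (u32)(wide >> 32);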
armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt); + armAsm->lsr(hi.X(), lo.X(), 32); +} + +void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_div(CompileFlags cf) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const WRegister rlo = CFGetRegLO(cf); + const WRegister rhi = CFGetRegHI(cf); + + // TODO: This could be slightly more optimal + Label done; + Label not_divide_by_zero; + armAsm->cbnz(rt, ¬_divide_by_zero); + armAsm->cmp(rs, 0); + armAsm->mov(rhi, rs); // hi = num + EmitMov(rlo, 1); + EmitMov(RWSCRATCH, static_cast(-1)); + armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1 + armAsm->b(&done); + + armAsm->bind(¬_divide_by_zero); + Label not_unrepresentable; + armAsm->cmp(rs, armCheckCompareConstant(static_cast(0x80000000u))); + armAsm->b(¬_unrepresentable, ne); + armAsm->cmp(rt, armCheckCompareConstant(-1)); + armAsm->b(¬_unrepresentable, ne); + + EmitMov(rlo, 0x80000000u); + EmitMov(rhi, 0); + armAsm->b(&done); + + armAsm->bind(¬_unrepresentable); + + armAsm->sdiv(rlo, rs, rt); + + // TODO: skip when hi is dead + armAsm->msub(rhi, rlo, rt, rs); + + armAsm->bind(&done); +} + +void CPU::NewRec::AArch64Compiler::Compile_divu(CompileFlags cf) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const WRegister rlo = CFGetRegLO(cf); + const WRegister rhi = CFGetRegHI(cf); + + Label done; + Label not_divide_by_zero; + armAsm->cbnz(rt, ¬_divide_by_zero); + EmitMov(rlo, static_cast(-1)); + armAsm->mov(rhi, rs); + armAsm->b(&done); + + armAsm->bind(¬_divide_by_zero); + + armAsm->udiv(rlo, rs, rt); + + // TODO: skip when hi is dead + armAsm->msub(rhi, rlo, rt, rs); + + armAsm->bind(&done); +} + +void CPU::NewRec::AArch64Compiler::TestOverflow(const vixl::aarch64::WRegister& result) +{ + SwitchToFarCode(true, vs); + + BackupHostState(); + + // toss the result + ClearHostReg(result.GetCode()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::AArch64Compiler::Compile_dst_op(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Operand&), + bool commutative, bool logical, bool overflow) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister rd = CFGetRegD(cf); + if (cf.valid_host_s && cf.valid_host_t) + { + (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf)); + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const WRegister src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (armAsm->*op)(rd, src, logical ? 
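    // For reference, the DIV/DIVU sequences above reproduce the MIPS-defined results for the two
    // cases the hardware divider does not trap on (sketch):
    //
    //   if (rt == 0)                                              // divide by zero
    //   {
    //     HI = rs;
    //     LO = sign ? (((s32)rs >= 0) ? 0xFFFFFFFFu : 1u) : 0xFFFFFFFFu;
    //   }
    //   else if (sign && rs == 0x80000000u && rt == 0xFFFFFFFFu)  // INT_MIN / -1
    //   {
    //     LO = 0x80000000u;
    //     HI = 0;
    //   }
    //   else
    //   {
    //     LO = rs / rt;                                           // signed or unsigned per opcode
    //     HI = rs % rt;
    //   }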
armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv)); + } + else + { + if (rd.GetCode() != src.GetCode()) + armAsm->mov(rd, src); + overflow = false; + } + } + else if (cf.const_s) + { + // TODO: Check where we can use wzr here + EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS())); + (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf)); + } + else if (cf.const_t) + { + const WRegister rs = CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv)); + } + else + { + if (rd.GetCode() != rs.GetCode()) + armAsm->mov(rd, rs); + overflow = false; + } + } + + if (overflow) + TestOverflow(rd); +} + +void CPU::NewRec::AArch64Compiler::Compile_add(CompileFlags cf) +{ + if (g_settings.cpu_recompiler_memory_exceptions) + Compile_dst_op(cf, &Assembler::adds, true, false, true); + else + Compile_dst_op(cf, &Assembler::add, true, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::add, true, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_sub(CompileFlags cf) +{ + if (g_settings.cpu_recompiler_memory_exceptions) + Compile_dst_op(cf, &Assembler::subs, false, false, true); + else + Compile_dst_op(cf, &Assembler::sub, false, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::sub, false, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_and(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // special cases - and with self -> self, and with 0 -> 0 + const WRegister regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + armAsm->mov(regd, CFGetRegS(cf)); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + armAsm->mov(regd, wzr); + return; + } + + Compile_dst_op(cf, &Assembler::and_, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_or(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // or/nor with 0 -> no effect + const WRegister regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::orr, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_xor(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + armAsm->mov(regd, wzr); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? 
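    // When cpu_recompiler_memory_exceptions is enabled, ADD/ADDI/SUB above are emitted with the
    // flag-setting forms (adds/subs) and TestOverflow() branches on the V flag into the far-code
    // area, where the half-written destination is discarded and Exception::Ov is raised; the
    // common no-overflow path keeps only a single not-taken branch in the near code. In outline:
    //
    //   adds rd, rs, rt
    //   b.vs far           // far: ClearHostReg(rd); EndBlockWithException(Exception::Ov)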
MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::eor, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf)); +} + +void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // TODO: swap and reverse op for constants + if (cf.const_s) + { + EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS())); + armAsm->cmp(RWSCRATCH, CFGetRegT(cf)); + } + else if (cf.const_t) + { + armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT()))); + } + else + { + armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf)); + } + + armAsm->cset(CFGetRegD(cf), sign ? lt : lo); +} + +vixl::aarch64::WRegister +CPU::NewRec::AArch64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf, + const std::optional& address, + const std::optional& reg) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const WRegister dst = reg.has_value() ? reg.value() : RWARG1; + if (address.has_value()) + { + EmitMov(dst, address.value()); + } + else if (imm == 0) + { + if (cf.valid_host_s) + { + if (const WRegister src = CFGetRegS(cf); src.GetCode() != dst.GetCode()) + armAsm->mov(dst, CFGetRegS(cf)); + } + else + { + armAsm->ldr(dst, MipsPtr(cf.MipsS())); + } + } + else + { + if (cf.valid_host_s) + { + armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast(inst->i.imm_sext32()))); + } + else + { + armAsm->ldr(dst, MipsPtr(cf.MipsS())); + armAsm->add(dst, dst, armCheckAddSubConstant(static_cast(inst->i.imm_sext32()))); + } + } + + return dst; +} + +template +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::GenerateLoad(const vixl::aarch64::WRegister& addr_reg, + MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + m_cycles += Bus::RAM_READ_TICKS; + + const WRegister dst = dst_reg_alloc(); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.GetCode() != RWARG3.GetCode()); + armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT); + armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8)); + } + + const MemOperand mem = + MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X()); + u8* start = m_emitter->GetCursorAddress(); + switch (size) + { + case MemoryAccessSize::Byte: + sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem); + break; + + case MemoryAccessSize::HalfWord: + sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem); + break; + + case MemoryAccessSize::Word: + armAsm->ldr(dst, mem); + break; + } + + AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true); + return dst; + } + + if (addr_reg.GetCode() != RWARG1.GetCode()) + armAsm->mov(RWARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? 
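      // For comparison with this C-call fallback: the fastmem path above resolves the host
      // address without leaving generated code. In LUT mode, roughly:
      //
      //   u8* base = ((u8* const*)membase)[address >> Bus::FASTMEM_LUT_PAGE_SHIFT];
      //   value    = *(T*)(base + address);   // entries are set up so the full 32-bit guest
      //                                       // address can be added directly
      //
      // and the access is recorded via AddLoadStoreInfo() so that a faulting access can later be
      // backpatched into a call to one of these thunks.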
reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCodeIfBitSet(RXRET, 63); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + armAsm->neg(temp.X(), RXRET); + armAsm->lsl(temp, temp, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + armAsm->orr(RWARG1, temp, + armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n))); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.GetCode()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } + + const WRegister dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst_reg.GetCode() != RWRET.GetCode()) + armAsm->mov(dst_reg, RWRET); + } + break; + } + + return dst_reg; +} + +void CPU::NewRec::AArch64Compiler::GenerateStore(const vixl::aarch64::WRegister& addr_reg, + const vixl::aarch64::WRegister& value_reg, MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.GetCode() != RWARG3.GetCode()); + armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT); + armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8)); + } + + const MemOperand mem = + MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X()); + u8* start = m_emitter->GetCursorAddress(); + switch (size) + { + case MemoryAccessSize::Byte: + armAsm->strb(value_reg, mem); + break; + + case MemoryAccessSize::HalfWord: + armAsm->strh(value_reg, mem); + break; + + case MemoryAccessSize::Word: + armAsm->str(value_reg, mem); + break; + } + AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false); + return; + } + + if (addr_reg.GetCode() != RWARG1.GetCode()) + armAsm->mov(RWARG1, addr_reg); + if (value_reg.GetCode() != RWARG2.GetCode()) + armAsm->mov(RWARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? 
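      // Return convention of the checked thunks, as used by the tests around these calls: the
      // read thunks return the loaded value in the low 32 bits, or a negative 64-bit value whose
      // negation is the exception code; the write thunks return 0 on success or the exception
      // code on failure. The far-code path then rebuilds CAUSE and raises the exception, roughly:
      //
      //   u32 cause = (excode << 2) | Cop0Registers::CAUSE::MakeValueForException(
      //                                 static_cast<Exception>(0), in_branch_delay_slot, false, cop_n);
      //   CPU::RaiseException(cause, m_current_instruction_pc);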
reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + armAsm->lsl(temp, RWRET, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + armAsm->orr(RWARG1, temp, + armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n))); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.GetCode()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const WRegister data = GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RWRET; + + return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, + cf.MipsT())); + }); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + + EmitMov(RWARG1, inst->bits); + armAsm->mov(RWARG2, addr); + armAsm->mov(RWARG3, data); + EmitCall(s_pgxp_mem_load_functions[static_cast(size)][static_cast(sign)]); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.GetCode()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + WRegister value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = WRegister(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + armAsm->mov(value, WRegister(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + armAsm->ldr(value, MipsPtr(rt)); + } + else + { + value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode()); + armAsm->and_(RWARG2, addr, 3); + armAsm->lsl(RWARG2, RWARG2, 3); // *8 + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWARG2); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + EmitMov(addr, 0xFFFFFFu); + armAsm->lsrv(addr, addr, RWARG2); + armAsm->and_(value, value, addr); + armAsm->lslv(RWRET, RWRET, RWARG3); + armAsm->orr(value, value, RWRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + armAsm->lsrv(RWRET, RWRET, RWARG2); + EmitMov(addr, 0xFFFFFF00u); + armAsm->lslv(addr, addr, RWARG3); + armAsm->and_(value, value, addr); + armAsm->orr(value, value, RWRET); + } + + FreeHostReg(addr.GetCode()); +} + +void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? 
std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RWRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + break; + } + + case GTERegisterAccessAction::Direct: + { + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::SignExtend16: + { + armAsm->sxth(RWRET, RWRET); + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + armAsm->uxth(RWRET, RWRET); + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG2, RWRET); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::WriteRegister)); + break; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode()); + armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0])); + armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->str(RWRET, PTR(&g_state.gte_regs.SXY2[0])); + break; + } + + default: + { + Panic("Unknown action"); + return; + } + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, RWRET); + armAsm->mov(RWARG2, addr); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_LWC2)); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, true); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const WRegister data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(RWARG2, cf); + + GenerateStore(addr, data, size); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + MoveMIPSRegToReg(RWARG3, cf.MipsT()); + armAsm->mov(RWARG2, addr); + EmitMov(RWARG1, inst->bits); + EmitCall(s_pgxp_mem_store_functions[static_cast(size)]); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. 
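  // SWL/SWR below are handled as a read-modify-write of the containing aligned word: load the
  // word at (addr & ~3u), merge the relevant bytes of rt into it based on shift = (addr & 3) * 8,
  // then store the merged word back. The masks match the little-endian MIPS definitions quoted in
  // the comments further down, e.g. for SWL: new = (mem & (0xFFFFFF00u << shift)) | (rt >> (24 - shift)).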
+ const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const WRegister value = RWARG2; + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + armAsm->mov(value, WRegister(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + armAsm->ldr(value, MipsPtr(rt)); + + armAsm->and_(RWSCRATCH, addr, 3); + armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + EmitMov(RWARG3, 0xFFFFFF00u); + armAsm->lslv(RWARG3, RWARG3, RWSCRATCH); + armAsm->and_(RWRET, RWRET, RWARG3); + + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWSCRATCH); + armAsm->lsrv(value, value, RWARG3); + armAsm->orr(value, value, RWRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + armAsm->lslv(value, value, RWSCRATCH); + + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWSCRATCH); + EmitMov(RWSCRATCH, 0x00FFFFFFu); + armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3); + armAsm->and_(RWRET, RWRET, RWSCRATCH); + armAsm->orr(value, value, RWRET); + } + + FreeHostReg(addr.GetCode()); + + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateStore(RWARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, true); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + armAsm->ldr(RWARG2, PTR(ptr)); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + armAsm->mov(RWARG2, RWRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + // PGXP makes this a giant pain. + if (!g_settings.gpu_pgxp_enable) + { + const WRegister addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RWARG2, size); + return; + } + + // TODO: This can be simplified because we don't need to validate in PGXP.. + const WRegister addr_reg = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + const WRegister data_backup = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + FlushForLoadStore(address, true); + ComputeLoadStoreAddressArg(cf, address, addr_reg); + armAsm->mov(data_backup, RWARG2); + GenerateStore(addr_reg, RWARG2, size); + + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, data_backup); + armAsm->mov(RWARG2, addr_reg); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SWC2)); + FreeHostReg(addr_reg.GetCode()); + FreeHostReg(data_backup.GetCode()); +} + +void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf) +{ + // TODO: we need better constant setting here.. 
which will need backprop + AssertRegOrConstT(cf); + + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const WRegister new_value = RWARG1; + const WRegister old_value = RWARG2; + const WRegister changed_bits = RWARG3; + const WRegister mask_reg = RWSCRATCH; + + // Load old value + armAsm->ldr(old_value, PTR(ptr)); + + // No way we fit this in an immediate.. + EmitMov(mask_reg, mask); + + // update value + if (cf.valid_host_t) + armAsm->and_(new_value, CFGetRegT(cf), mask_reg); + else + EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask); + + if (needs_bit_test) + armAsm->eor(changed_bits, old_value, new_value); + armAsm->bic(old_value, old_value, mask_reg); + armAsm->orr(new_value, old_value, new_value); + armAsm->str(new_value, PTR(ptr)); + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + SwitchToFarCodeIfBitSet(changed_bits, 16); + armAsm->sub(sp, sp, 16); + armAsm->stp(RWARG1, RWARG2, MemOperand(sp)); + EmitCall(reinterpret_cast(&CPU::UpdateMemoryPointers)); + armAsm->ldp(RWARG1, RWARG2, MemOperand(sp)); + armAsm->add(sp, sp, 16); + armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base)); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const WRegister sr = (reg == Cop0Reg::SR) ? RWARG2 : (armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits)), RWARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits)); + armAsm->bfxil(RWARG1, RWARG1, 2, 4); + armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits)); + + TestInterrupts(RWARG1); +} + +void CPU::NewRec::AArch64Compiler::TestInterrupts(const vixl::aarch64::WRegister& sr) +{ + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + armAsm->tbz(sr, 0, &no_interrupt); + + // sr & cause + armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits)); + armAsm->and_(sr, sr, RWSCRATCH); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + armAsm->tst(sr, 0xFF00); + + SwitchToFarCode(true, ne); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + EmitCall(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + armAsm->bind(&no_interrupt); +} + +void CPU::NewRec::AArch64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + u32 hreg; + if (action == GTERegisterAccessAction::Direct) + { + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? 
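  // RFE above is a single bitfield move: bfxil(sr, sr, 2, 4) computes
  //
  //   sr = (sr & ~0xFu) | ((sr >> 2) & 0xFu);
  //
  // i.e. it pops the two-level KU/IE mode stack while leaving the upper SR bits untouched.
  // TestInterrupts() then re-checks IEc and (SR & CAUSE & 0xFF00) and, if an interrupt became
  // pending, jumps to far code that flushes state and calls DispatchInterrupt().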
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + armAsm->ldr(WRegister(hreg), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + armAsm->mov(WRegister(hreg), RWRET); + } + else + { + Panic("Unknown action"); + return; + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, inst->bits); + armAsm->mov(RWARG2, WRegister(hreg)); + EmitCall(reinterpret_cast(&PGXP::CPU_MFC2)); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_mtc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr); + else + armAsm->str(CFGetRegT(cf), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16) + { + const bool sign = (action == GTERegisterAccessAction::SignExtend16); + if (cf.valid_host_t) + { + sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf)); + armAsm->str(RWARG1, PTR(ptr)); + } + else if (cf.const_t) + { + const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT())); + StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr); + } + else + { + Panic("Unsupported setup"); + } + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + MoveTToReg(RWARG2, cf); + EmitCall(reinterpret_cast(>E::WriteRegister)); + } + else if (action == GTERegisterAccessAction::PushFIFO) + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode()); + armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0])); + armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0])); + if (cf.valid_host_t) + armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0])); + else if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]); + else + Panic("Unsupported setup"); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_cop2(CompileFlags cf) +{ + TickCount func_ticks; + GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks); + + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK); + EmitCall(reinterpret_cast(func)); + + AddGTETicks(func_ticks); +} + +u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, + TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask, + u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, + bool is_load) +{ + Assembler arm_asm(static_cast(thunk_code), thunk_space); + Assembler* armAsm = &arm_asm; + +#ifdef VIXL_DEBUG + vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + static constexpr u32 GPR_SIZE = 8; + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { 
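    // First pass: only count the caller-saved host registers that are live in the faulting block
    // (per gpr_bitmask) and so must be preserved around the C call; for loads, the destination
    // register is skipped since it is about to be overwritten anyway. The frame below is rounded
    // to an even register count, stack_size = ((num_gprs + 1) & ~1u) * 8, which keeps SP 16-byte
    // aligned as AArch64 requires.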
+ if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE); + + // TODO: use stp+ldp, vixl helper? + + if (stack_size > 0) + { + armAsm->sub(sp, sp, stack_size); + + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + armAsm->str(XRegister(i), MemOperand(sp, stack_offset)); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + { + // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles + Assert(Assembler::IsImmAddSub(cycles_to_add)); + armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks)); + armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add); + armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks)); + } + + if (address_register != static_cast(RWARG1.GetCode())) + armAsm->mov(RWARG1, WRegister(address_register)); + + if (!is_load) + { + if (data_register != static_cast(RWARG2.GetCode())) + armAsm->mov(RWARG2, WRegister(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte), + false); + } + break; + case MemoryAccessSize::HalfWord: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord), + false); + } + break; + case MemoryAccessSize::Word: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord), + false); + } + break; + } + + if (is_load) + { + const WRegister dst = WRegister(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? 
armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst.GetCode() != RWRET.GetCode()) + armAsm->mov(dst, RWRET); + } + break; + } + } + + if (cycles_to_remove != 0) + { + Assert(Assembler::IsImmAddSub(cycles_to_remove)); + armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks)); + armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove); + armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks)); + } + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset)); + stack_offset += GPR_SIZE; + } + } + + armAsm->add(sp, sp, stack_size); + } + + armEmitJmp(armAsm, static_cast(code_address) + code_size, true); + armAsm->FinalizeCode(); + + return static_cast(armAsm->GetCursorOffset()); +} diff --git a/src/core/cpu_newrec_compiler_aarch64.h b/src/core/cpu_newrec_compiler_aarch64.h new file mode 100644 index 000000000..58c6b0a71 --- /dev/null +++ b/src/core/cpu_newrec_compiler_aarch64.h @@ -0,0 +1,164 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include + +#include "vixl/aarch64/assembler-aarch64.h" + +namespace CPU::NewRec { + +class AArch64Compiler final : public Compiler +{ +public: + AArch64Compiler(); + ~AArch64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, u32 far_code_space) override; + void BeginBlock() override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const vixl::aarch64::WRegister& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf, bool overflow); + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_shift(CompileFlags cf, void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, unsigned)); + void 
Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Register&), + void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, unsigned)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const vixl::aarch64::WRegister& result); + void Compile_dst_op(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Operand&), + bool commutative, bool logical, bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + vixl::aarch64::WRegister + ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + vixl::aarch64::WRegister GenerateLoad(const vixl::aarch64::WRegister& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc); + void GenerateStore(const vixl::aarch64::WRegister& addr_reg, const vixl::aarch64::WRegister& value_reg, + MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const vixl::aarch64::WRegister& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void EmitMov(const vixl::aarch64::WRegister& dst, u32 val); + void EmitCall(const void* ptr, bool force_inline = false); + + vixl::aarch64::Operand armCheckAddSubConstant(s32 val); + vixl::aarch64::Operand armCheckAddSubConstant(u32 val); + vixl::aarch64::Operand armCheckCompareConstant(s32 val); + vixl::aarch64::Operand armCheckLogicalConstant(u32 val); + + void 
SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond = vixl::aarch64::Condition::al); + void SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit); + void SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero); + void SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond = vixl::aarch64::Condition::al); + + void AssertRegOrConstS(CompileFlags cf) const; + void AssertRegOrConstT(CompileFlags cf) const; + vixl::aarch64::MemOperand MipsPtr(Reg r) const; + vixl::aarch64::WRegister CFGetRegD(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegS(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegT(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegLO(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegHI(CompileFlags cf) const; + + void MoveSToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf); + void MoveTToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf); + void MoveMIPSRegToReg(const vixl::aarch64::WRegister& dst, Reg reg); + + std::unique_ptr m_emitter; + std::unique_ptr m_far_emitter; + vixl::aarch64::Assembler* armAsm; + +#ifdef VIXL_DEBUG + std::unique_ptr m_emitter_check; + std::unique_ptr m_far_emitter_check; +#endif +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_riscv64.cpp b/src/core/cpu_newrec_compiler_riscv64.cpp new file mode 100644 index 000000000..88ad4783e --- /dev/null +++ b/src/core/cpu_newrec_compiler_riscv64.cpp @@ -0,0 +1,2453 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_riscv64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_code_cache_private.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "gte.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#ifdef ENABLE_HOST_DISASSEMBLY +extern "C" { +#include "riscv-disas.h" +} +#endif + +// For LW/SW/etc. 
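// Note that the macro below expands to *two* arguments, "offset-from-g_state, RSTATE", which slot
// straight into biscuit's load/store signatures of the form LW(rd, imm, rs). For example (sketch):
//
//   rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
//   // expands to: rvAsm->LW(RARG1, (u32)((u8*)&g_state.pending_ticks - (u8*)&g_state), RSTATE);
//
// This relies on the accessed State fields sitting within the signed 12-bit I-type displacement
// reachable from RSTATE.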
+#define PTR(x) ((u32)(((u8*)(x)) - ((u8*)&g_state))), RSTATE + +static constexpr u32 BLOCK_LINK_SIZE = 8; // auipc+jr + +namespace CPU::NewRec { + +using namespace biscuit; + +using CPU::Recompiler::rvEmitCall; +using CPU::Recompiler::rvEmitDSExtW; +using CPU::Recompiler::rvEmitDUExtW; +using CPU::Recompiler::rvEmitJmp; +using CPU::Recompiler::rvEmitMov; +using CPU::Recompiler::rvEmitMov64; +using CPU::Recompiler::rvEmitSExtB; +using CPU::Recompiler::rvEmitSExtH; +using CPU::Recompiler::rvEmitUExtB; +using CPU::Recompiler::rvEmitUExtH; +using CPU::Recompiler::rvGetAddressImmediates; +using CPU::Recompiler::rvIsCallerSavedRegister; +using CPU::Recompiler::rvIsValidSExtITypeImm; +using CPU::Recompiler::rvMoveAddressToReg; + +RISCV64Compiler s_instance; +Compiler* g_compiler = &s_instance; + +} // namespace CPU::NewRec + +bool CPU::Recompiler::rvIsCallerSavedRegister(u32 id) +{ + return (id == 1 || (id >= 3 && id < 8) || (id >= 10 && id <= 17) || (id >= 28 && id <= 31)); +} + +bool CPU::Recompiler::rvIsValidSExtITypeImm(u32 imm) +{ + return (static_cast((static_cast(imm) << 20) >> 20) == imm); +} + +std::pair CPU::Recompiler::rvGetAddressImmediates(const void* cur, const void* target) +{ + const s64 disp = static_cast(reinterpret_cast(target) - reinterpret_cast(cur)); + Assert(disp >= static_cast(std::numeric_limits::min()) && + disp <= static_cast(std::numeric_limits::max())); + + const s64 hi = disp + 0x800; + const s64 lo = disp - (hi & 0xFFFFF000); + return std::make_pair(static_cast(hi >> 12), static_cast((lo << 52) >> 52)); +} + +void CPU::Recompiler::rvMoveAddressToReg(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr) +{ + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr); + rvAsm->AUIPC(reg, hi); + rvAsm->ADDI(reg, reg, lo); +} + +void CPU::Recompiler::rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm) +{ + // Borrowed from biscuit, but doesn't emit an ADDI if the lower 12 bits are zero. + const u32 lower = imm & 0xFFF; + const u32 upper = (imm & 0xFFFFF000) >> 12; + const s32 simm = static_cast(imm); + if (rvIsValidSExtITypeImm(simm)) + { + rvAsm->ADDI(rd, biscuit::zero, static_cast(lower)); + } + else + { + const bool needs_increment = (lower & 0x800) != 0; + const u32 upper_imm = needs_increment ? upper + 1 : upper; + rvAsm->LUI(rd, upper_imm); + rvAsm->ADDI(rd, rd, static_cast(lower)); + } +} + +void CPU::Recompiler::rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch, + u64 imm) +{ + // TODO: Make better.. 
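  // The sequence below builds a 64-bit constant from two 32-bit halves; the extra SLLI/SRLI pair
  // is needed because rvEmitMov() sign-extends on RV64, so the low half has to be re-zero-extended
  // before the add. As a sketch:
  //
  //   rd      = (s64)(s32)(u32)(imm >> 32);   // upper half (sign extension is shifted out below)
  //   scratch = (s64)(s32)(u32)imm;           // lower half, sign-extended by ADDI on RV64
  //   rd    <<= 32;
  //   scratch = (u64)(scratch << 32) >> 32;   // re-zero-extend the low half
  //   rd     += scratch;
  //
  // (Related: rvGetAddressImmediates() above splits a PC-relative displacement for AUIPC+ADDI as
  // hi = (disp + 0x800) >> 12, lo = disp - (hi << 12); the +0x800 rounds so that lo always fits
  // the signed 12-bit ADDI immediate.)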
+ rvEmitMov(rvAsm, rd, static_cast(imm >> 32)); + rvEmitMov(rvAsm, scratch, static_cast(imm)); + rvAsm->SLLI64(rd, rd, 32); + rvAsm->SLLI64(scratch, scratch, 32); + rvAsm->SRLI64(scratch, scratch, 32); + rvAsm->ADD(rd, rd, scratch); +} + +u32 CPU::Recompiler::rvEmitJmp(biscuit::Assembler* rvAsm, const void* ptr, const biscuit::GPR& link_reg) +{ + // TODO: use J if displacement is <1MB, needs a bool because backpatch must be 8 bytes + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), ptr); + rvAsm->AUIPC(RSCRATCH, hi); + rvAsm->JALR(link_reg, lo, RSCRATCH); + return 8; +} + +u32 CPU::Recompiler::rvEmitCall(biscuit::Assembler* rvAsm, const void* ptr) +{ + return rvEmitJmp(rvAsm, ptr, biscuit::ra); +} + +void CPU::Recompiler::rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 24); + rvAsm->SRAIW(rd, rd, 24); +} + +void CPU::Recompiler::rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->ANDI(rd, rs, 0xFF); +} + +void CPU::Recompiler::rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 16); + rvAsm->SRAIW(rd, rd, 16); +} + +void CPU::Recompiler::rvEmitUExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 16); + rvAsm->SRLI(rd, rd, 16); +} + +void CPU::Recompiler::rvEmitDSExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->ADDIW(rd, rs, 0); +} + +void CPU::Recompiler::rvEmitDUExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI64(rd, rs, 32); + rvAsm->SRLI64(rd, rd, 32); +} + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + const u8* cur = static_cast(start); + const u8* end = cur + size; + char buf[256]; + while (cur < end) + { + rv_inst inst; + size_t instlen; + inst_fetch(cur, &inst, &instlen); + disasm_inst(buf, std::size(buf), rv64, static_cast(reinterpret_cast(cur)), inst); + Log_DebugPrintf("\t0x%016" PRIx64 "\t%s", static_cast(reinterpret_cast(cur)), buf); + cur += instlen; + } +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); +#endif +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + const u8* cur = static_cast(start); + const u8* end = cur + size; + u32 icount = 0; + while (cur < end) + { + rv_inst inst; + size_t instlen; + inst_fetch(cur, &inst, &instlen); + cur += instlen; + icount++; + } + return icount; +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); + return 0; +#endif +} + +u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) +{ + using namespace CPU::Recompiler; + using namespace biscuit; + + Assembler actual_asm(static_cast(code), code_size); + Assembler* rvAsm = &actual_asm; + + Label dispatch; + + g_enter_recompiler = reinterpret_cast(rvAsm->GetCursorPointer()); + { + // TODO: reserve some space for saving caller-saved registers + + // Need the CPU state for basically everything :-) + rvMoveAddressToReg(rvAsm, RSTATE, &g_state); + + // Fastmem setup + if (IsUsingFastmem()) + rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base)); + + // Downcount isn't set on entry, so we need to initialize it + rvMoveAddressToReg(rvAsm, RARG1, TimingEvents::GetHeadEventPtr()); + rvAsm->LD(RARG1, 0, RARG1); + rvAsm->LW(RARG1, offsetof(TimingEvent, m_downcount), RARG1); + rvAsm->SW(RARG1, 
PTR(&g_state.downcount)); + + // Fall through to event dispatcher + } + + // check events then for frame done + g_check_events_and_dispatch = rvAsm->GetCursorPointer(); + { + Label skip_event_check; + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + rvAsm->LW(RARG2, PTR(&g_state.downcount)); + rvAsm->BLTU(RARG1, RARG2, &skip_event_check); + + g_run_events_and_dispatch = rvAsm->GetCursorPointer(); + rvEmitCall(rvAsm, reinterpret_cast(&TimingEvents::RunEvents)); + + rvAsm->Bind(&skip_event_check); + } + + // TODO: align? + g_dispatcher = rvAsm->GetCursorPointer(); + { + rvAsm->Bind(&dispatch); + + // x9 <- s_fast_map[pc >> 16] + rvAsm->LWU(RARG1, PTR(&g_state.pc)); + rvMoveAddressToReg(rvAsm, RARG3, g_code_lut.data()); + rvAsm->SRLI(RARG2, RARG1, 16); + rvAsm->SLLI(RARG1, RARG1, 1); + rvAsm->SLLI(RARG2, RARG2, 3); + rvAsm->ADD(RARG2, RARG2, RARG3); + rvAsm->LD(RARG2, 0, RARG2); + + // blr(x9[pc * 2]) (fast_map[pc >> 2]) + rvAsm->ADD(RARG1, RARG1, RARG2); + rvAsm->LD(RARG1, 0, RARG1); + rvAsm->JR(RARG1); + } + + g_compile_or_revalidate_block = rvAsm->GetCursorPointer(); + { + rvAsm->LW(RARG1, PTR(&g_state.pc)); + rvEmitCall(rvAsm, reinterpret_cast(&CompileOrRevalidateBlock)); + rvAsm->J(&dispatch); + } + + g_discard_and_recompile_block = rvAsm->GetCursorPointer(); + { + rvAsm->LW(RARG1, PTR(&g_state.pc)); + rvEmitCall(rvAsm, reinterpret_cast(&DiscardAndRecompileBlock)); + rvAsm->J(&dispatch); + } + + g_interpret_block = rvAsm->GetCursorPointer(); + { + rvEmitCall(rvAsm, CodeCache::GetInterpretUncachedBlockFunction()); + rvAsm->J(&dispatch); + } + + // TODO: align? + + return static_cast(rvAsm->GetCodeBuffer().GetSizeInBytes()); +} + +u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) +{ + // TODO: get rid of assembler construction here + { + biscuit::Assembler assembler(static_cast(code), BLOCK_LINK_SIZE); + CPU::Recompiler::rvEmitCall(&assembler, dst); + + DebugAssert(assembler.GetCodeBuffer().GetSizeInBytes() <= BLOCK_LINK_SIZE); + if (assembler.GetCodeBuffer().GetRemainingBytes() > 0) + assembler.NOP(); + } + + if (flush_icache) + JitCodeBuffer::FlushInstructionCache(code, BLOCK_LINK_SIZE); + + return BLOCK_LINK_SIZE; +} + +CPU::NewRec::RISCV64Compiler::RISCV64Compiler() = default; + +CPU::NewRec::RISCV64Compiler::~RISCV64Compiler() = default; + +const void* CPU::NewRec::RISCV64Compiler::GetCurrentCodePointer() +{ + return rvAsm->GetCursorPointer(); +} + +void CPU::NewRec::RISCV64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. + DebugAssert(!m_emitter && !m_far_emitter && !rvAsm); + m_emitter = std::make_unique(code_buffer, code_buffer_space); + m_far_emitter = std::make_unique(far_code_buffer, far_code_space); + rvAsm = m_emitter.get(); + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.Index() : NUM_HOST_REGS; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& hra = m_host_regs[i]; + + if (i == RARG1.Index() || i == RARG2.Index() || i == RARG3.Index() || i == RSCRATCH.Index() || + i == RSTATE.Index() || i == membase_idx || i < 5 /* zero, ra, sp, gp, tp */) + { + continue; + } + + hra.flags = HR_USABLE | (rvIsCallerSavedRegister(i) ? 
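    // For reference, the dispatcher emitted in EmitASMFunctions() above resolves the next block
    // as, in effect,
    //
    //   code = g_code_lut[pc >> 16][(pc & 0xFFFF) >> 2];
    //
    // The "pc << 1" / "(pc >> 16) << 3" arithmetic is the same lookup with the scaling folded in:
    // entries are 8-byte pointers and pc's low two bits are always zero, so (pc & 0xFFFF) * 2 ==
    // ((pc & 0xFFFF) >> 2) * 8; the per-page pointers are evidently biased so that adding the full
    // pc * 2 (rather than the masked offset) lands on the right slot.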
0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::RISCV64Compiler::SwitchToFarCode( + bool emit_jump, + void (biscuit::Assembler::*inverted_cond)(biscuit::GPR, biscuit::GPR, biscuit::Label*) /* = nullptr */, + const biscuit::GPR& rs1 /* = biscuit::zero */, const biscuit::GPR& rs2 /* = biscuit::zero */) +{ + DebugAssert(rvAsm == m_emitter.get()); + if (emit_jump) + { + const void* target = m_far_emitter->GetCursorPointer(); + if (inverted_cond) + { + Label skip; + (rvAsm->*inverted_cond)(rs1, rs2, &skip); + rvEmitJmp(rvAsm, target); + rvAsm->Bind(&skip); + } + else + { + rvEmitCall(rvAsm, target); + } + } + rvAsm = m_far_emitter.get(); +} + +void CPU::NewRec::RISCV64Compiler::SwitchToNearCode(bool emit_jump) +{ + DebugAssert(rvAsm == m_far_emitter.get()); + if (emit_jump) + rvEmitJmp(rvAsm, m_emitter->GetCursorPointer()); + rvAsm = m_emitter.get(); +} + +void CPU::NewRec::RISCV64Compiler::EmitMov(const biscuit::GPR& dst, u32 val) +{ + rvEmitMov(rvAsm, dst, val); +} + +void CPU::NewRec::RISCV64Compiler::EmitCall(const void* ptr) +{ + rvEmitCall(rvAsm, ptr); +} + +void CPU::NewRec::RISCV64Compiler::SafeImmSExtIType(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm, + void (biscuit::Assembler::*iop)(GPR, GPR, u32), + void (biscuit::Assembler::*rop)(GPR, GPR, GPR)) +{ + DebugAssert(rd != RSCRATCH && rs != RSCRATCH); + + if (rvIsValidSExtITypeImm(imm)) + { + (rvAsm->*iop)(rd, rs, imm); + return; + } + + rvEmitMov(rvAsm, RSCRATCH, imm); + (rvAsm->*rop)(rd, rs, RSCRATCH); +} + +void CPU::NewRec::RISCV64Compiler::SafeADDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::ADDI), + &Assembler::ADD); +} + +void CPU::NewRec::RISCV64Compiler::SafeADDIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::ADDIW), + &Assembler::ADDW); +} + +void CPU::NewRec::RISCV64Compiler::SafeSUBIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + const u32 nimm = static_cast(-static_cast(imm)); + SafeImmSExtIType(rd, rs, nimm, reinterpret_cast(&Assembler::ADDIW), + &Assembler::ADDW); +} + +void CPU::NewRec::RISCV64Compiler::SafeANDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::ANDI, &Assembler::AND); +} + +void CPU::NewRec::RISCV64Compiler::SafeORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::ORI, &Assembler::OR); +} + +void CPU::NewRec::RISCV64Compiler::SafeXORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::XORI, &Assembler::XOR); +} + +void CPU::NewRec::RISCV64Compiler::SafeSLTI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::SLTI), + &Assembler::SLT); +} + +void CPU::NewRec::RISCV64Compiler::SafeSLTIU(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::SLTIU), + &Assembler::SLTU); +} + +void CPU::NewRec::RISCV64Compiler::EmitSExtB(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitSExtB(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitUExtB(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitUExtB(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitSExtH(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitSExtH(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitUExtH(const biscuit::GPR& rd, const 
biscuit::GPR& rs) +{ + rvEmitUExtH(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitDSExtW(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitDSExtW(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitDUExtW(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitDUExtW(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + // TODO: 64-bit displacement is needed :/ + // rvMoveAddressToReg(rvAsm, RARG1, ram_ptr); + // rvMoveAddressToReg(rvAsm, RARG2, shadow_ptr); + rvEmitMov64(rvAsm, RARG1, RSCRATCH, static_cast(reinterpret_cast(ram_ptr))); + rvEmitMov64(rvAsm, RARG2, RSCRATCH, static_cast(reinterpret_cast(shadow_ptr))); + + u32 offset = 0; + Label block_changed; + + while (size >= 8) + { + rvAsm->LD(RARG3, offset, RARG1); + rvAsm->LD(RSCRATCH, offset, RARG2); + rvAsm->BNE(RARG3, RSCRATCH, &block_changed); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + rvAsm->LWU(RARG3, offset, RARG1); + rvAsm->LWU(RSCRATCH, offset, RARG2); + rvAsm->BNE(RARG3, RSCRATCH, &block_changed); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + Label block_unchanged; + rvAsm->J(&block_unchanged); + rvAsm->Bind(&block_changed); + rvEmitJmp(rvAsm, CodeCache::g_discard_and_recompile_block); + rvAsm->Bind(&block_unchanged); +} + +void CPU::NewRec::RISCV64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + SafeADDIW(RARG1, RARG1, static_cast(m_block->uncached_fetch_ticks)); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } + else + { + const auto& ticks_reg = RARG1; + const auto& current_tag_reg = RARG2; + const auto& existing_tag_reg = RARG3; + + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + rvAsm->LW(ticks_reg, PTR(&g_state.pending_ticks)); + rvEmitMov(rvAsm, current_tag_reg, current_pc); + + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = offsetof(State, icache_tags) + (line * sizeof(u32)); + + // TODO: Verify sign extension here... 
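+ // Per cache line: if the stored tag differs from the current tag, write the
+ // new tag and charge the fill time. Roughly:
+ //   if (icache_tags[line] != tag) { icache_tags[line] = tag; ticks += fill_ticks; }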
+ Label cache_hit; + rvAsm->LW(existing_tag_reg, offset, RSTATE); + rvAsm->BEQ(existing_tag_reg, current_tag_reg, &cache_hit); + + rvAsm->SW(current_tag_reg, offset, RSTATE); + SafeADDIW(ticks_reg, ticks_reg, static_cast(fill_ticks)); + rvAsm->Bind(&cache_hit); + + if (i != (m_block->icache_line_count - 1)) + SafeADDIW(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE); + } + + rvAsm->SW(ticks_reg, PTR(&g_state.pending_ticks)); + } +} + +void CPU::NewRec::RISCV64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RARG1.Index())) + rvAsm->MV(RARG1, GPR(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RARG2.Index())) + rvAsm->MV(RARG2, GPR(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RARG3.Index())) + rvAsm->MV(RARG3, GPR(arg3reg)); + EmitCall(func); +} + +void CPU::NewRec::RISCV64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + { + EmitMov(RSCRATCH, newpc.value()); + rvAsm->SW(RSCRATCH, PTR(&g_state.pc)); + } + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void CPU::NewRec::RISCV64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + // TODO: move the cycle flush in here.. + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || m_gte_done_cycle > cycles || cycles > 0) + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + rvAsm->LW(RARG2, PTR(&g_state.downcount)); + if (cycles > 0) + { + SafeADDIW(RARG1, RARG1, cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } + if (m_gte_done_cycle > cycles) + { + SafeADDIW(RARG2, RARG1, m_gte_done_cycle - cycles); + rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick)); + } + + if (do_event_test) + { + // TODO: see if we can do a far jump somehow with this.. + Label cont; + rvAsm->BLT(RARG1, RARG2, &cont); + rvEmitJmp(rvAsm, CodeCache::g_run_events_and_dispatch); + rvAsm->Bind(&cont); + } + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + rvEmitJmp(rvAsm, CodeCache::g_dispatcher); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. 
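+ // GetBufferPointer(0) is the start of this block's near-code buffer, so the
+ // emitted jump simply loops back to the top of the block being compiled.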
+ Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + rvEmitJmp(rvAsm, rvAsm->GetBufferPointer(0)); + } + else + { + const void* target = CreateBlockLink(m_block, rvAsm->GetCursorPointer(), newpc.value()); + rvEmitJmp(rvAsm, target); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::RISCV64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ + u8* const code = m_emitter->GetBufferPointer(0); + *code_size = static_cast(m_emitter->GetCodeBuffer().GetSizeInBytes()); + *far_code_size = static_cast(m_far_emitter->GetCodeBuffer().GetSizeInBytes()); + rvAsm = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const char* CPU::NewRec::RISCV64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0", "a1", "a2", "a3", "a4", "a5", + "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6"}}; + return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::RISCV64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + EmitMov(GPR(reg), val); +} + +void CPU::NewRec::RISCV64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + rvAsm->LW(GPR(reg), PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + rvAsm->SW(GPR(reg), PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + if (val == 0) + { + rvAsm->SW(zero, PTR(ptr)); + return; + } + + EmitMov(RSCRATCH, val); + rvAsm->SW(RSCRATCH, PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + rvAsm->MV(GPR(dst), GPR(src)); +} + +void CPU::NewRec::RISCV64Compiler::AssertRegOrConstS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s || cf.const_s); +} + +void CPU::NewRec::RISCV64Compiler::AssertRegOrConstT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t || cf.const_t); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegS(CompileFlags cf, const biscuit::GPR& temp_reg) +{ + if (cf.valid_host_s) + { + return GPR(cf.host_s); + } + else if (cf.const_s) + { + if (HasConstantRegValue(cf.MipsS(), 0)) + return zero; + + EmitMov(temp_reg, GetConstantRegU32(cf.MipsS())); + return temp_reg; + } + else + { + Log_WarningPrintf("Hit memory path in CFGetSafeRegS() for %s", GetRegName(cf.MipsS())); + rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_s])); + return temp_reg; + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegT(CompileFlags cf, const biscuit::GPR& temp_reg) +{ + if (cf.valid_host_t) + { + return GPR(cf.host_t); + } + else if (cf.const_t) + { + if (HasConstantRegValue(cf.MipsT(), 0)) + return zero; + + EmitMov(temp_reg, GetConstantRegU32(cf.MipsT())); + return temp_reg; + } + else + { + Log_WarningPrintf("Hit memory path in CFGetSafeRegT() for %s", GetRegName(cf.MipsT())); + rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_t])); + return temp_reg; + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return GPR(cf.host_d); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return GPR(cf.host_s); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return GPR(cf.host_t); +} + +biscuit::GPR 
CPU::NewRec::RISCV64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return GPR(cf.host_lo); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return GPR(cf.host_hi); +} + +void CPU::NewRec::RISCV64Compiler::MoveSToReg(const biscuit::GPR& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != dst.Index()) + rvAsm->MV(dst, GPR(cf.host_s)); + } + else if (cf.const_s) + { + EmitMov(dst, GetConstantRegU32(cf.MipsS())); + } + else + { + Log_WarningPrintf("Hit memory path in MoveSToReg() for %s", GetRegName(cf.MipsS())); + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + } +} + +void CPU::NewRec::RISCV64Compiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != dst.Index()) + rvAsm->MV(dst, GPR(cf.host_t)); + } + else if (cf.const_t) + { + EmitMov(dst, GetConstantRegU32(cf.MipsT())); + } + else + { + Log_WarningPrintf("Hit memory path in MoveTToReg() for %s", GetRegName(cf.MipsT())); + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_t])); + } +} + +void CPU::NewRec::RISCV64Compiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + rvAsm->MV(dst, GPR(hreg.value())); + else if (HasConstantReg(reg)) + EmitMov(dst, GetConstantRegU32(reg)); + else + rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast(reg)])); +} + +void CPU::NewRec::RISCV64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RARG3, arg3reg); + + EmitMov(RARG1, arg1val); + EmitCall(func); +} + +void CPU::NewRec::RISCV64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + // This sucks, but it's only used for fallbacks. + Panic("Not implemented"); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + rvAsm->LBU(RARG1, PTR(&g_state.load_delay_reg)); + rvAsm->LW(RARG2, PTR(&g_state.load_delay_value)); + rvAsm->SLLI(RARG1, RARG1, 2); // *4 + rvAsm->ADD(RARG1, RARG1, RSTATE); + rvAsm->SW(RARG2, offsetof(CPU::State, regs.r[0]), RARG1); + rvAsm->LI(RSCRATCH, static_cast(Reg::count)); + rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + EmitMov(RSCRATCH, static_cast(m_load_delay_register)); + rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. 
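+ // The sequence below computes
+ //   pending_ticks = max(pending_ticks + m_cycles, gte_completion_tick)
+ // with a compare-and-branch, since base RV64I has no conditional-move instruction.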
+ rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + rvAsm->LW(RARG2, PTR(&g_state.gte_completion_tick)); + if (m_cycles > 0) + { + SafeADDIW(RARG1, RARG1, m_cycles); + m_cycles = 0; + } + Label no_stall; + rvAsm->BGE(RARG1, RARG2, &no_stall); + rvAsm->MV(RARG1, RARG2); + rvAsm->Bind(&no_stall); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + SafeADDIW(RARG1, RARG1, m_cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + SafeADDIW(RARG1, RARG1, m_gte_done_cycle); + rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick)); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + SafeADDIW(RARG1, RARG1, m_cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + +#if 0 + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. + // but nothing should be going through here.. + Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +#else + Panic("Fixme"); +#endif +} + +void CPU::NewRec::RISCV64Compiler::CheckBranchTarget(const biscuit::GPR& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + DebugAssert(pcreg != RSCRATCH); + rvAsm->ANDI(RSCRATCH, pcreg, 0x3); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_jr(CompileFlags cf) +{ + const GPR pcreg = CFGetRegS(cf); + CheckBranchTarget(pcreg); + + rvAsm->SW(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_jalr(CompileFlags cf) +{ + const GPR pcreg = CFGetRegS(cf); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + rvAsm->SW(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + AssertRegOrConstS(cf); + + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. 
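+ // Only the eq/ne conditions actually compare against rt; the remaining
+ // conditions test rs against zero, so rt never needs a host register there.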
+ DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + Label taken; + const GPR rs = CFGetRegS(cf); + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + AssertRegOrConstT(cf); + if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0)) + { + (cond == BranchCondition::Equal) ? rvAsm->BEQZ(rs, &taken) : rvAsm->BNEZ(rs, &taken); + } + else + { + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG1; + if (!cf.valid_host_t) + MoveTToReg(RARG1, cf); + if (cond == Compiler::BranchCondition::Equal) + rvAsm->BEQ(rs, rt, &taken); + else + rvAsm->BNE(rs, rt, &taken); + } + } + break; + + case BranchCondition::GreaterThanZero: + { + rvAsm->BGTZ(rs, &taken); + } + break; + + case BranchCondition::GreaterEqualZero: + { + rvAsm->BGEZ(rs, &taken); + } + break; + + case BranchCondition::LessThanZero: + { + rvAsm->BLTZ(rs, &taken); + } + break; + + case BranchCondition::LessEqualZero: + { + rvAsm->BLEZ(rs, &taken); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + rvAsm->Bind(&taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf, bool overflow) +{ + const GPR rs = CFGetRegS(cf); + const GPR rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + if (!overflow) + { + SafeADDIW(rt, rs, imm); + } + else + { + SafeADDI(RARG1, rs, imm); + SafeADDIW(rt, rs, imm); + TestOverflow(RARG1, rt, rt); + } + } + else if (rt.Index() != rs.Index()) + { + rvAsm->MV(rt, rs); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf) +{ + Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addiu(CompileFlags cf) +{ + Compile_addi(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + if (sign) + SafeSLTI(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32()); + else + SafeSLTIU(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32()); +} + +void CPU::NewRec::RISCV64Compiler::Compile_andi(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeANDI(rt, CFGetRegS(cf), imm); + else + EmitMov(rt, 0); +} + +void CPU::NewRec::RISCV64Compiler::Compile_ori(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + const GPR rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeORI(rt, rs, imm); + else if (rt.Index() != rs.Index()) + rvAsm->MV(rt, rs); +} + +void CPU::NewRec::RISCV64Compiler::Compile_xori(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + const GPR rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeXORI(rt, rs, imm); + else if (rt.Index() != rs.Index()) + rvAsm->MV(rt, rs); +} + +void CPU::NewRec::RISCV64Compiler::Compile_shift( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)) +{ + const GPR rd = CFGetRegD(cf); + const GPR rt = CFGetRegT(cf); + if (inst->r.shamt > 0) + (rvAsm->*op_const)(rd, rt, inst->r.shamt); + else if (rd.Index() != rt.Index()) 
+ rvAsm->MV(rd, rt); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sll(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SLLW, &Assembler::SLLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srl(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SRLW, &Assembler::SRLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sra(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SRAW, &Assembler::SRAIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_variable_shift( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)) +{ + const GPR rd = CFGetRegD(cf); + + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + if (cf.const_s) + { + if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0) + (rvAsm->*op_const)(rd, rt, shift & 31u); + else if (rd.Index() != rt.Index()) + rvAsm->MV(rd, rt); + } + else + { + (rvAsm->*op)(rd, rt, CFGetRegS(cf)); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SLLW, &Assembler::SLLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SRLW, &Assembler::SRLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SRAW, &Assembler::SRAIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + // TODO: if lo/hi gets killed, we can use a 32-bit multiply + const GPR lo = CFGetRegLO(cf); + const GPR hi = CFGetRegHI(cf); + + if (sign) + { + rvAsm->MUL(lo, rs, rt); + rvAsm->SRAI64(hi, lo, 32); + EmitDSExtW(lo, lo); + } + else + { + // Need to make it unsigned. + EmitDUExtW(RARG1, rs); + EmitDUExtW(RARG2, rt); + rvAsm->MUL(lo, RARG1, RARG2); + rvAsm->SRAI64(hi, lo, 32); + EmitDSExtW(lo, lo); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_div(CompileFlags cf) +{ + // 36 Volume I: RISC-V User-Level ISA V2.2 + const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const GPR rlo = CFGetRegLO(cf); + const GPR rhi = CFGetRegHI(cf); + + Label done; + Label not_divide_by_zero; + rvAsm->BNEZ(rt, ¬_divide_by_zero); + rvAsm->MV(rhi, rs); // hi = num + rvAsm->SRAI64(rlo, rs, 63); + rvAsm->ANDI(rlo, rlo, 2); + rvAsm->ADDI(rlo, rlo, -1); // lo = s >= 0 ? -1 : 1 + rvAsm->J(&done); + + rvAsm->Bind(¬_divide_by_zero); + Label not_unrepresentable; + EmitMov(RSCRATCH, static_cast(-1)); + rvAsm->BNE(rt, RSCRATCH, ¬_unrepresentable); + EmitMov(rlo, 0x80000000u); + rvAsm->BNE(rs, rlo, ¬_unrepresentable); + EmitMov(rhi, 0); + rvAsm->J(&done); + + rvAsm->Bind(¬_unrepresentable); + + rvAsm->DIVW(rlo, rs, rt); + rvAsm->REMW(rhi, rs, rt); + + rvAsm->Bind(&done); +} + +void CPU::NewRec::RISCV64Compiler::Compile_divu(CompileFlags cf) +{ + const GPR rs = cf.valid_host_s ? 
CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const GPR rlo = CFGetRegLO(cf); + const GPR rhi = CFGetRegHI(cf); + + // Semantics match? :-) + rvAsm->DIVUW(rlo, rs, rt); + rvAsm->REMUW(rhi, rs, rt); +} + +void CPU::NewRec::RISCV64Compiler::TestOverflow(const biscuit::GPR& long_res, const biscuit::GPR& res, + const biscuit::GPR& reg_to_discard) +{ + SwitchToFarCode(true, &Assembler::BEQ, long_res, res); + + BackupHostState(); + + // toss the result + ClearHostReg(reg_to_discard.Index()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_dst_op( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (RISCV64Compiler::*op_const)(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm), + void (biscuit::Assembler::*op_long)(biscuit::GPR, biscuit::GPR, biscuit::GPR), bool commutative, bool overflow) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rd = CFGetRegD(cf); + + if (overflow) + { + const GPR rs = CFGetSafeRegS(cf, RARG1); + const GPR rt = CFGetSafeRegT(cf, RARG2); + (rvAsm->*op)(RARG3, rs, rt); + (rvAsm->*op_long)(rd, rs, rt); + TestOverflow(RARG3, rd, rd); + return; + } + + if (cf.valid_host_s && cf.valid_host_t) + { + (rvAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf)); + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const GPR src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (this->*op_const)(rd, src, cv); + } + else + { + if (rd.Index() != src.Index()) + rvAsm->MV(rd, src); + overflow = false; + } + } + else if (cf.const_s) + { + if (HasConstantRegValue(cf.MipsS(), 0)) + { + (rvAsm->*op)(rd, zero, CFGetRegT(cf)); + } + else + { + EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS())); + (rvAsm->*op)(rd, RSCRATCH, CFGetRegT(cf)); + } + } + else if (cf.const_t) + { + const GPR rs = CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? 
cf.MipsS() : cf.MipsT()); cv != 0) + { + (this->*op_const)(rd, rs, cv); + } + else + { + if (rd.Index() != rs.Index()) + rvAsm->MV(rd, rs); + overflow = false; + } + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_add(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true, + g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sub(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false, + g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_and(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // special cases - and with self -> self, and with 0 -> 0 + const GPR regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + rvAsm->MV(regd, CFGetRegS(cf)); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + EmitMov(regd, 0); + return; + } + + Compile_dst_op(cf, &Assembler::AND, &RISCV64Compiler::SafeANDI, &Assembler::AND, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_or(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // or/nor with 0 -> no effect + const GPR regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::OR, &RISCV64Compiler::SafeORI, &Assembler::OR, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_xor(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + EmitMov(regd, 0); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? 
MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::XOR, &RISCV64Compiler::SafeXORI, &Assembler::XOR, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + rvAsm->NOT(CFGetRegD(cf), CFGetRegD(cf)); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rd = CFGetRegD(cf); + const GPR rs = CFGetSafeRegS(cf, RARG1); + + if (cf.const_t && rvIsValidSExtITypeImm(GetConstantRegU32(cf.MipsT()))) + { + if (sign) + rvAsm->SLTI(rd, rs, GetConstantRegS32(cf.MipsT())); + else + rvAsm->SLTIU(rd, rs, GetConstantRegS32(cf.MipsT())); + } + else + { + const GPR rt = CFGetSafeRegT(cf, RARG2); + if (sign) + rvAsm->SLT(rd, rs, rt); + else + rvAsm->SLTU(rd, rs, rt); + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::ComputeLoadStoreAddressArg( + CompileFlags cf, const std::optional& address, const std::optional& reg) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const GPR dst = reg.has_value() ? reg.value() : RARG1; + if (address.has_value()) + { + EmitMov(dst, address.value()); + } + else if (imm == 0) + { + if (cf.valid_host_s) + { + if (const GPR src = CFGetRegS(cf); src.Index() != dst.Index()) + rvAsm->MV(dst, CFGetRegS(cf)); + } + else + { + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + } + } + else + { + if (cf.valid_host_s) + { + SafeADDIW(dst, CFGetRegS(cf), inst->i.imm_sext32()); + } + else + { + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + SafeADDIW(dst, dst, inst->i.imm_sext32()); + } + } + + return dst; +} + +template +void CPU::NewRec::RISCV64Compiler::GenerateLoad(const biscuit::GPR& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + m_cycles += Bus::RAM_READ_TICKS; + + // TODO: Make this better. If we're loading the address from state, we can use LWU instead, and skip this. + // TODO: LUT fastmem + const GPR dst = dst_reg_alloc(); + rvAsm->SLLI64(RSCRATCH, addr_reg, 32); + rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.Index() != RARG3.Index()); + rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT); + rvAsm->SLLI64(RARG3, RARG3, 8); + rvAsm->ADD(RARG3, RARG3, RMEMBASE); + rvAsm->LD(RARG3, 0, RARG3); + rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3); + } + else + { + rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE); + } + + u8* start = m_emitter->GetCursorPointer(); + switch (size) + { + case MemoryAccessSize::Byte: + sign ? rvAsm->LB(dst, 0, RSCRATCH) : rvAsm->LBU(dst, 0, RSCRATCH); + break; + + case MemoryAccessSize::HalfWord: + sign ? rvAsm->LH(dst, 0, RSCRATCH) : rvAsm->LHU(dst, 0, RSCRATCH); + break; + + case MemoryAccessSize::Word: + rvAsm->LW(dst, 0, RSCRATCH); + break; + } + + // We need a nop, because the slowmem jump might be more than 1MB away. 
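+ // The access above plus this NOP reserves 8 bytes, the same size as the
+ // AUIPC+JALR pair rvEmitJmp() produces, so the site can later be backpatched
+ // with a jump to the slow-memory thunk (see CompileLoadStoreThunk()).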
+ rvAsm->NOP(); + + AddLoadStoreInfo(start, 8, addr_reg.Index(), dst.Index(), size, sign, true); + return; + } + + if (addr_reg.Index() != RARG1.Index()) + rvAsm->MV(RARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + rvAsm->SRLI64(RSCRATCH, RRET, 63); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED)); + rvAsm->NEG(temp, RRET); + rvAsm->SLLIW(temp, temp, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + SafeORI(RARG1, temp, + Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.Index()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } + + const GPR dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? EmitSExtB(dst_reg, RRET) : EmitUExtB(dst_reg, RRET); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? EmitSExtH(dst_reg, RRET) : EmitUExtH(dst_reg, RRET); + } + break; + case MemoryAccessSize::Word: + { + // Need to undo the zero-extend. + if (checked) + rvEmitDSExtW(rvAsm, dst_reg, RRET); + else if (dst_reg.Index() != RRET.Index()) + rvAsm->MV(dst_reg, RRET); + } + break; + } +} + +void CPU::NewRec::RISCV64Compiler::GenerateStore(const biscuit::GPR& addr_reg, const biscuit::GPR& value_reg, + MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + DebugAssert(value_reg != RSCRATCH); + rvAsm->SLLI64(RSCRATCH, addr_reg, 32); + rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.Index() != RARG3.Index()); + rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT); + rvAsm->SLLI64(RARG3, RARG3, 8); + rvAsm->ADD(RARG3, RARG3, RMEMBASE); + rvAsm->LD(RARG3, 0, RARG3); + rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3); + } + else + { + rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE); + } + + u8* start = m_emitter->GetCursorPointer(); + switch (size) + { + case MemoryAccessSize::Byte: + rvAsm->SB(value_reg, 0, RSCRATCH); + break; + + case MemoryAccessSize::HalfWord: + rvAsm->SH(value_reg, 0, RSCRATCH); + break; + + case MemoryAccessSize::Word: + rvAsm->SW(value_reg, 0, RSCRATCH); + break; + } + + // We need a nop, because the slowmem jump might be more than 1MB away. 
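+ // Same 8-byte reservation as the load path; the AddLoadStoreInfo() call below
+ // records the value register rather than a destination, since this is a store.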
+ rvAsm->NOP(); + + AddLoadStoreInfo(start, 8, addr_reg.Index(), value_reg.Index(), size, false, false); + return; + } + + if (addr_reg.Index() != RARG1.Index()) + rvAsm->MV(RARG1, addr_reg); + if (value_reg.Index() != RARG2.Index()) + rvAsm->MV(RARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCode(true, &Assembler::BEQ, RRET, zero); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED)); + rvAsm->SLLIW(temp, RRET, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + SafeORI(RARG1, temp, + Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.Index()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, false); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RRET; + + return GPR(AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, + cf.MipsT())); + }); +} + +void CPU::NewRec::RISCV64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const GPR addr = GPR(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateLoad(RARG1, MemoryAccessSize::Word, false, []() { return RRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.Index()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + GPR value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = GPR(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = GPR(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + rvAsm->MV(value, GPR(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + rvAsm->LW(value, PTR(&g_state.regs.r[static_cast(rt)])); + } + else + { + value = GPR(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value.Index() != RARG2.Index() && value.Index() != RARG3.Index()); + rvAsm->ANDI(RARG2, addr, 3); + rvAsm->SLLIW(RARG2, RARG2, 3); // *8 + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RARG2); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + EmitMov(addr, 0xFFFFFFu); + rvAsm->SRLW(addr, addr, RARG2); + rvAsm->AND(value, value, addr); + rvAsm->SLLW(RRET, RRET, RARG3); + rvAsm->OR(value, value, RRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + rvAsm->SRLW(RRET, RRET, RARG2); + EmitMov(addr, 0xFFFFFF00u); + rvAsm->SLLW(addr, addr, RARG3); + rvAsm->AND(value, value, addr); + rvAsm->OR(value, value, RRET); + } + + FreeHostReg(addr.Index()); +} + +void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, false); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + return; + } + + case GTERegisterAccessAction::Direct: + { + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::SignExtend16: + { + EmitSExtH(RRET, RRET); + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + EmitUExtH(RRET, RRET); + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + rvAsm->MV(RARG2, RRET); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::WriteRegister)); + return; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index()); + rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0])); + rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0])); + rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->SW(RRET, PTR(&g_state.gte_regs.SXY2[0])); + return; + } + + default: + { + Panic("Unknown action"); + return; + } + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + FlushForLoadStore(address, true); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + + if (!cf.valid_host_t) + MoveTToReg(RARG2, cf); + + GenerateStore(addr, cf.valid_host_t ? 
CFGetRegT(cf) : RARG2, size); +} + +void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. + const GPR addr = GPR(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateLoad(RARG1, MemoryAccessSize::Word, false, []() { return RRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const GPR value = RARG2; + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + rvAsm->MV(value, GPR(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + rvAsm->LW(value, PTR(&g_state.regs.r[static_cast(rt)])); + + rvAsm->ANDI(RSCRATCH, addr, 3); + rvAsm->SLLIW(RSCRATCH, RSCRATCH, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + EmitMov(RARG3, 0xFFFFFF00u); + rvAsm->SLLW(RARG3, RARG3, RSCRATCH); + rvAsm->AND(RRET, RRET, RARG3); + + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RSCRATCH); + rvAsm->SRLW(value, value, RARG3); + rvAsm->OR(value, value, RRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + rvAsm->SLLW(value, value, RSCRATCH); + + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RSCRATCH); + EmitMov(RSCRATCH, 0x00FFFFFFu); + rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3); + rvAsm->AND(RRET, RRET, RSCRATCH); + rvAsm->OR(value, value, RRET); + } + + FreeHostReg(addr.Index()); + + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateStore(RARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, true); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + rvAsm->LW(RARG2, PTR(ptr)); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + rvAsm->MV(RARG2, RRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RARG2, size); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mtc0(CompileFlags cf) +{ + // TODO: we need better constant setting here.. 
which will need backprop + AssertRegOrConstT(cf); + + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const GPR new_value = RARG1; + const GPR old_value = RARG2; + const GPR changed_bits = RARG3; + const GPR mask_reg = RSCRATCH; + + // Load old value + rvAsm->LW(old_value, PTR(ptr)); + + // No way we fit this in an immediate.. + EmitMov(mask_reg, mask); + + // update value + // TODO: This is creating pointless MV instructions.. why? + if (cf.valid_host_t) + rvAsm->AND(new_value, CFGetRegT(cf), mask_reg); + else + EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask); + + if (needs_bit_test) + rvAsm->XOR(changed_bits, old_value, new_value); + rvAsm->NOT(mask_reg, mask_reg); + rvAsm->AND(old_value, old_value, mask_reg); + rvAsm->OR(new_value, old_value, new_value); + rvAsm->SW(new_value, PTR(ptr)); + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + rvAsm->SRLIW(RSCRATCH, changed_bits, 16); + rvAsm->ANDI(RSCRATCH, RSCRATCH, 1); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + rvAsm->ADDI(sp, sp, -16); + rvAsm->SW(RARG1, 0, sp); + rvAsm->SW(RARG2, 8, sp); + EmitCall(reinterpret_cast(&CPU::UpdateMemoryPointers)); + rvAsm->SW(RARG2, 8, sp); + rvAsm->SW(RARG1, 0, sp); + rvAsm->ADDI(sp, sp, 16); + rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base)); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const GPR sr = (reg == Cop0Reg::SR) ? 
RARG2 : (rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits)), RARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits)); + rvAsm->SRLIW(RSCRATCH, RARG1, 2); + rvAsm->ANDI(RSCRATCH, RSCRATCH, 0xf); + rvAsm->ANDI(RARG1, RARG1, ~0xfu); + rvAsm->OR(RARG1, RARG1, RSCRATCH); + rvAsm->SW(RARG1, PTR(&g_state.cop0_regs.sr.bits)); + + TestInterrupts(RARG1); +} + +void CPU::NewRec::RISCV64Compiler::TestInterrupts(const biscuit::GPR& sr) +{ + DebugAssert(sr != RSCRATCH); + + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + rvAsm->ANDI(RSCRATCH, sr, 1); + rvAsm->BEQZ(RSCRATCH, &no_interrupt); + + // sr & cause + rvAsm->LW(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits)); + rvAsm->AND(sr, sr, RSCRATCH); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + rvAsm->SRLIW(sr, sr, 8); + rvAsm->ANDI(sr, sr, 0xFF); + SwitchToFarCode(true, &Assembler::BEQ, sr, zero); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + EmitCall(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + rvAsm->Bind(&no_interrupt); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + const u32 hreg = + AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + rvAsm->LW(GPR(hreg), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + + const u32 hreg = + AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + rvAsm->MV(GPR(hreg), RRET); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_mtc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr); + else + rvAsm->SW(CFGetRegT(cf), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16) + { + const bool sign = (action == GTERegisterAccessAction::SignExtend16); + if (cf.valid_host_t) + { + sign ? EmitSExtH(RARG1, CFGetRegT(cf)) : EmitUExtH(RARG1, CFGetRegT(cf)); + rvAsm->SW(RARG1, PTR(ptr)); + } + else if (cf.const_t) + { + const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT())); + StoreConstantToCPUPointer(sign ? 
::SignExtend32(cv) : ::ZeroExtend32(cv), ptr); + } + else + { + Panic("Unsupported setup"); + } + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + MoveTToReg(RARG2, cf); + EmitCall(reinterpret_cast(>E::WriteRegister)); + } + else if (action == GTERegisterAccessAction::PushFIFO) + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index()); + rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0])); + rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0])); + rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0])); + if (cf.valid_host_t) + rvAsm->SW(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0])); + else if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]); + else + Panic("Unsupported setup"); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_cop2(CompileFlags cf) +{ + TickCount func_ticks; + GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks); + + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK); + EmitCall(reinterpret_cast(func)); + + AddGTETicks(func_ticks); +} + +u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, + TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask, + u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, + bool is_load) +{ + Assembler arm_asm(static_cast(thunk_code), thunk_space); + Assembler* rvAsm = &arm_asm; + + static constexpr u32 GPR_SIZE = 8; + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE); + + if (stack_size > 0) + { + rvAsm->ADDI(sp, sp, -static_cast(stack_size)); + + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + rvAsm->SD(GPR(i), stack_offset, sp); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + { + // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles + Assert(rvIsValidSExtITypeImm(cycles_to_add)); + rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks)); + rvAsm->ADDIW(RSCRATCH, RSCRATCH, cycles_to_add); + rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks)); + } + + if (address_register != RARG1.Index()) + rvAsm->MV(RARG1, GPR(address_register)); + + if (!is_load) + { + if (data_register != RARG2.Index()) + rvAsm->MV(RARG2, GPR(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + rvEmitCall(rvAsm, is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + rvEmitCall(rvAsm, is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + rvEmitCall(rvAsm, is_load ? 
reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + if (is_load) + { + const GPR dst = GPR(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? rvEmitSExtB(rvAsm, dst, RRET) : rvEmitUExtB(rvAsm, dst, RRET); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? rvEmitSExtH(rvAsm, dst, RRET) : rvEmitUExtH(rvAsm, dst, RRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst.Index() != RRET.Index()) + rvAsm->MV(dst, RRET); + } + break; + } + } + + if (cycles_to_remove != 0) + { + Assert(rvIsValidSExtITypeImm(-cycles_to_remove)); + rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks)); + rvAsm->ADDIW(RSCRATCH, RSCRATCH, -cycles_to_remove); + rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks)); + } + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + rvAsm->LD(GPR(i), stack_offset, sp); + stack_offset += GPR_SIZE; + } + } + + rvAsm->ADDI(sp, sp, stack_size); + } + + rvEmitJmp(rvAsm, static_cast(code_address) + code_size); + + return static_cast(rvAsm->GetCodeBuffer().GetSizeInBytes()); +} diff --git a/src/core/cpu_newrec_compiler_riscv64.h b/src/core/cpu_newrec_compiler_riscv64.h new file mode 100644 index 000000000..96a265e33 --- /dev/null +++ b/src/core/cpu_newrec_compiler_riscv64.h @@ -0,0 +1,168 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include + +namespace CPU::NewRec { + +class RISCV64Compiler final : public Compiler +{ +public: + RISCV64Compiler(); + ~RISCV64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const biscuit::GPR& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf, bool overflow); + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + 
void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_shift(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)); + void Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const biscuit::GPR& long_res, const biscuit::GPR& res, const biscuit::GPR& reg_to_discard); + void Compile_dst_op(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (RISCV64Compiler::*op_const)(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm), + void (biscuit::Assembler::*op_long)(biscuit::GPR, biscuit::GPR, biscuit::GPR), bool commutative, + bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + biscuit::GPR ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + void GenerateLoad(const biscuit::GPR& addr_reg, MemoryAccessSize size, bool sign, const RegAllocFn& dst_reg_alloc); + void GenerateStore(const biscuit::GPR& addr_reg, const biscuit::GPR& value_reg, MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const biscuit::GPR& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void EmitMov(const biscuit::GPR& dst, u32 val); + void EmitCall(const void* ptr); + + void SwitchToFarCode(bool emit_jump, + void 
(biscuit::Assembler::*inverted_cond)(biscuit::GPR, biscuit::GPR, biscuit::Label*) = nullptr, + const biscuit::GPR& rs1 = biscuit::zero, const biscuit::GPR& rs2 = biscuit::zero); + void SwitchToNearCode(bool emit_jump); + + void AssertRegOrConstS(CompileFlags cf) const; + void AssertRegOrConstT(CompileFlags cf) const; + // vixl::aarch64::MemOperand MipsPtr(Reg r) const; + + void SafeImmSExtIType(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm, + void (biscuit::Assembler::*iop)(biscuit::GPR, biscuit::GPR, u32), + void (biscuit::Assembler::*rop)(biscuit::GPR, biscuit::GPR, biscuit::GPR)); + + void SafeADDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeADDIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSUBIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeANDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeXORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSLTI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSLTIU(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + + void EmitSExtB(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitUExtB(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitSExtH(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitUExtH(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitDSExtW(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitDUExtW(const biscuit::GPR& rd, const biscuit::GPR& rs); + + biscuit::GPR CFGetSafeRegS(CompileFlags cf, const biscuit::GPR& temp_reg); + biscuit::GPR CFGetSafeRegT(CompileFlags cf, const biscuit::GPR& temp_reg); + + biscuit::GPR CFGetRegD(CompileFlags cf) const; + biscuit::GPR CFGetRegS(CompileFlags cf) const; + biscuit::GPR CFGetRegT(CompileFlags cf) const; + biscuit::GPR CFGetRegLO(CompileFlags cf) const; + biscuit::GPR CFGetRegHI(CompileFlags cf) const; + + void MoveSToReg(const biscuit::GPR& dst, CompileFlags cf); + void MoveTToReg(const biscuit::GPR& dst, CompileFlags cf); + void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg); + + std::unique_ptr m_emitter; + std::unique_ptr m_far_emitter; + biscuit::Assembler* rvAsm; +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_x64.cpp b/src/core/cpu_newrec_compiler_x64.cpp new file mode 100644 index 000000000..7f458360b --- /dev/null +++ b/src/core/cpu_newrec_compiler_x64.cpp @@ -0,0 +1,2196 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_x64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_code_cache_private.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "cpu_recompiler_types.h" +#include "gte.h" +#include "pgxp.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#define RMEMBASE cg->rbx +#define RSTATE cg->rbp + +// #define PTR(x) (cg->rip + (x)) +#define PTR(x) (RSTATE + (u32)(((u8*)(x)) - ((u8*)&g_state))) + +// PGXP TODO: LWL etc, MFC0 +// PGXP TODO: Spyro 1 level gates have issues. + +static constexpr u32 BACKPATCH_JMP_SIZE = 5; + +using namespace Xbyak; + +using CPU::Recompiler::IsCallerSavedRegister; + +// TODO: try using a pointer to state instead of rip-relative.. 
it might end up faster due to smaller code + +namespace CPU::NewRec { +X64Compiler s_instance; +Compiler* g_compiler = &s_instance; +} // namespace CPU::NewRec + +CPU::NewRec::X64Compiler::X64Compiler() = default; + +CPU::NewRec::X64Compiler::~X64Compiler() = default; + +void CPU::NewRec::X64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. + DebugAssert(!m_emitter && !m_far_emitter && !cg); + m_emitter = std::make_unique(code_buffer_space, code_buffer); + m_far_emitter = std::make_unique(far_code_space, far_code_buffer); + cg = m_emitter.get(); + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast(RMEMBASE.getIdx()) : NUM_HOST_REGS; + const u32 cpu_idx = static_cast(RSTATE.getIdx()); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + + if (i == static_cast(RWRET.getIdx()) || i == static_cast(RWARG1.getIdx()) || + i == static_cast(RWARG2.getIdx()) || i == static_cast(RWARG3.getIdx()) || + i == static_cast(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx || + i == static_cast(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */) + { + continue; + } + + ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::X64Compiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) +{ + DebugAssert(cg == m_emitter.get()); + if (emit_jump) + { + const void* fcptr = m_far_emitter->getCurr(); + (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr); + } + cg = m_far_emitter.get(); +} + +void CPU::NewRec::X64Compiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) +{ + DebugAssert(cg == m_far_emitter.get()); + if (emit_jump) + { + const void* fcptr = m_emitter->getCurr(); + (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr); + } + cg = m_emitter.get(); +} + +void CPU::NewRec::X64Compiler::BeginBlock() +{ + Compiler::BeginBlock(); + +#if 0 + if (m_block->pc == 0xBFC06F0C) + { + //__debugbreak(); + cg->db(0xcc); + } +#endif + +#if 0 + cg->nop(); + cg->mov(RWARG1, m_block->pc); + cg->nop(); +#endif +} + +void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + cg->mov(RXARG1, static_cast(reinterpret_cast(ram_ptr))); + cg->mov(RXARG2, static_cast(reinterpret_cast(shadow_ptr))); + + bool first = true; + u32 offset = 0; + while (size >= 16) + { + const Xbyak::Xmm& dst = first ? 
cg->xmm0 : cg->xmm1; + cg->movups(dst, cg->xword[RXARG1 + offset]); + cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]); + if (!first) + cg->pand(cg->xmm0, dst); + else + first = false; + + offset += 16; + size -= 16; + } + + // TODO: better codegen for 16 byte aligned blocks + if (!first) + { + cg->movmskps(cg->eax, cg->xmm0); + cg->cmp(cg->eax, 0xf); + cg->jne(CodeCache::g_discard_and_recompile_block); + } + + while (size >= 8) + { + cg->mov(RXARG3, cg->qword[RXARG1 + offset]); + cg->cmp(RXARG3, cg->qword[RXARG2 + offset]); + cg->jne(CodeCache::g_discard_and_recompile_block); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + cg->mov(RWARG3, cg->dword[RXARG1 + offset]); + cg->cmp(RWARG3, cg->dword[RXARG2 + offset]); + cg->jne(CodeCache::g_discard_and_recompile_block); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); +} + +void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(m_block->uncached_fetch_ticks)); + } + else if (m_block->icache_line_count > 0) + { + cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]); + + // TODO: Vectorize this... + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc); + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = (line * sizeof(u32)); + Xbyak::Label cache_hit; + + cg->cmp(cg->dword[RXARG1 + offset], tag); + cg->je(cache_hit); + cg->mov(cg->dword[RXARG1 + offset], tag); + cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(fill_ticks)); + cg->L(cache_hit); + } + } +} + +void CPU::NewRec::X64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RXARG1.getIdx())) + cg->mov(RXARG1, Reg64(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RXARG2.getIdx())) + cg->mov(RXARG2, Reg64(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RXARG3.getIdx())) + cg->mov(RXARG3, Reg64(arg3reg)); + cg->call(func); +} + +void CPU::NewRec::X64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value()); + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void CPU::NewRec::X64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... 
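+  // Roughly, the code emitted below performs (sketch only, based on the calls that follow):
+  //   CPU::RaiseException(Cop0Registers::CAUSE::MakeValueForException(excode, in_branch_delay_slot, false, cop_n),
+  //                       current_instruction_pc);
+  // The CAUSE value is a compile-time constant here, so the block just loads two immediates and
+  // calls the helper, which takes care of the CAUSE/EPC bookkeeping.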
+ Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(static_cast(&CPU::RaiseException)); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // fast path when not doing an event test + if (!do_event_test && m_gte_done_cycle <= cycles) + { + if (cycles == 1) + cg->inc(cg->dword[PTR(&g_state.pending_ticks)]); + else if (cycles > 0) + cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles); + } + else + { + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles) + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + if (cycles > 0) + cg->add(RWARG1, cycles); + if (m_gte_done_cycle > cycles) + { + cg->mov(RWARG2, RWARG1); + ((m_gte_done_cycle - cycles) == 1) ? cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles); + cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2); + } + if (do_event_test) + cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]); + if (cycles > 0) + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + if (do_event_test) + cg->jge(CodeCache::g_run_events_and_dispatch); + } + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + cg->jmp(CodeCache::g_dispatcher); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. + Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + cg->jmp(cg->getCode()); + } + else + { + const void* target = CodeCache::CreateBlockLink(m_block, cg->getCurr(), newpc.value()); + cg->jmp(target, CodeGenerator::T_NEAR); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::X64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ + const void* code = m_emitter->getCode(); + *code_size = static_cast(m_emitter->getSize()); + *far_code_size = static_cast(m_far_emitter->getSize()); + cg = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const void* CPU::NewRec::X64Compiler::GetCurrentCodePointer() +{ + return cg->getCurr(); +} + +const char* CPU::NewRec::X64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}}; + return (reg < reg64_names.size()) ? 
reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::X64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + cg->mov(Reg32(reg), val); +} + +void CPU::NewRec::X64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + cg->mov(Reg32(reg), cg->dword[PTR(ptr)]); +} + +void CPU::NewRec::X64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + cg->mov(cg->dword[PTR(ptr)], Reg32(reg)); +} + +void CPU::NewRec::X64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + cg->mov(cg->dword[PTR(ptr)], val); +} + +void CPU::NewRec::X64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + cg->mov(Reg32(dst), Reg32(src)); +} + +Xbyak::Address CPU::NewRec::X64Compiler::MipsPtr(Reg r) const +{ + DebugAssert(r < Reg::count); + return cg->dword[PTR(&g_state.regs.r[static_cast(r)])]; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return Reg32(cf.host_d); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return Reg32(cf.host_s); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return Reg32(cf.host_t); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return Reg32(cf.host_lo); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return Reg32(cf.host_hi); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToD(CompileFlags cf) +{ + DebugAssert(cf.valid_host_d); + DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d); + + const Reg32 rd = CFGetRegD(cf); + MoveSToReg(rd, cf); + + return rd; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToT(CompileFlags cf) +{ + DebugAssert(cf.valid_host_t); + + const Reg32 rt = CFGetRegT(cf); + if (cf.valid_host_s) + { + const Reg32 rs = CFGetRegS(cf); + if (rt != rs) + cg->mov(rt, rs); + } + else if (cf.const_s) + { + if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0) + cg->mov(rt, cv); + else + cg->xor_(rt, rt); + } + else + { + cg->mov(rt, MipsPtr(cf.MipsS())); + } + + return rt; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveTToD(CompileFlags cf) +{ + DebugAssert(cf.valid_host_d); + DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d); + + const Reg32 rd = CFGetRegD(cf); + MoveTToReg(rd, cf); + return rd; +} + +void CPU::NewRec::X64Compiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != static_cast(dst.getIdx())) + cg->mov(dst, Reg32(cf.host_s)); + } + else if (cf.const_s) + { + const u32 cv = GetConstantRegU32(cf.MipsS()); + if (cv == 0) + cg->xor_(dst, dst); + else + cg->mov(dst, cv); + } + else + { + cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]); + } +} + +void CPU::NewRec::X64Compiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != static_cast(dst.getIdx())) + cg->mov(dst, Reg32(cf.host_t)); + } + else if (cf.const_t) + { + const u32 cv = GetConstantRegU32(cf.MipsT()); + if (cv == 0) + cg->xor_(dst, dst); + else + cg->mov(dst, cv); + } + else + { + cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]); + } +} + +void CPU::NewRec::X64Compiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + cg->mov(dst, Reg32(hreg.value())); + else if 
(HasConstantReg(reg)) + cg->mov(dst, GetConstantRegU32(reg)); + else + cg->mov(dst, MipsPtr(reg)); +} + +void CPU::NewRec::X64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RWARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RWARG3, arg3reg); + + cg->mov(RWARG1, arg1val); + cg->call(func); +} + +void CPU::NewRec::X64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits); + cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc); + cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]); + cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast(Reg::count)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast(m_load_delay_register)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]); + if (m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); + m_cycles = 0; + } + cg->cmp(RWARG2, RWARG1); + cg->cmova(RWARG1, RWARG2); + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + (m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle); + cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) : + cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::X64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. 
+ // but nothing should be going through here.. + Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +} + +void CPU::NewRec::X64Compiler::CheckBranchTarget(const Xbyak::Reg32& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + cg->test(pcreg, 0x3); + SwitchToFarCode(true, &CodeGenerator::jnz); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::X64Compiler::Compile_jr(CompileFlags cf) +{ + if (!cf.valid_host_s) + cg->mov(RWARG1, MipsPtr(cf.MipsS())); + + const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + CheckBranchTarget(pcreg); + + cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::Compile_jalr(CompileFlags cf) +{ + if (!cf.valid_host_s) + cg->mov(RWARG1, MipsPtr(cf.MipsS())); + + const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. + DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + // TODO: Swap this back to near once instructions don't blow up + constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR; + Label taken; + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + // we should always have S, maybe not T + // TODO: if it's zero, we can just do test rs, rs + if (cf.valid_host_t) + cg->cmp(CFGetRegS(cf), CFGetRegT(cf)); + else if (cf.const_t) + cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT())); + else + cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT())); + + (cond == BranchCondition::Equal) ? 
cg->je(taken, type) : cg->jne(taken, type); + } + break; + + case BranchCondition::GreaterThanZero: + { + cg->cmp(CFGetRegS(cf), 0); + cg->jg(taken, type); + } + break; + + case BranchCondition::GreaterEqualZero: + { + cg->test(CFGetRegS(cf), CFGetRegS(cf)); + cg->jns(taken, type); + } + break; + + case BranchCondition::LessThanZero: + { + cg->test(CFGetRegS(cf), CFGetRegS(cf)); + cg->js(taken, type); + } + break; + + case BranchCondition::LessEqualZero: + { + cg->cmp(CFGetRegS(cf), 0); + cg->jle(taken, type); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + cg->L(taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::X64Compiler::Compile_addi(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + cg->add(rt, imm); + if (g_settings.cpu_recompiler_memory_exceptions) + { + DebugAssert(cf.valid_host_t); + TestOverflow(rt); + } + } +} + +void CPU::NewRec::X64Compiler::Compile_addiu(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + cg->add(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1; + + // Case where T == S, can't use xor because it changes flags + if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s) + cg->xor_(rt, rt); + + if (cf.valid_host_s) + cg->cmp(CFGetRegS(cf), inst->i.imm_sext32()); + else + cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32()); + + if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s) + cg->mov(rt, 0); + + sign ? 
cg->setl(rt.cvt8()) : cg->setb(rt.cvt8()); + + if (!cf.valid_host_t) + cg->mov(MipsPtr(cf.MipsT()), rt); +} + +void CPU::NewRec::X64Compiler::Compile_andi(CompileFlags cf) +{ + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + { + const Reg32 rt = MoveSToT(cf); + cg->and_(rt, imm); + } + else + { + const Reg32 rt = CFGetRegT(cf); + cg->xor_(rt, rt); + } +} + +void CPU::NewRec::X64Compiler::Compile_ori(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + cg->or_(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_xori(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + cg->xor_(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_sll(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->shl(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_srl(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->shr(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_sra(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->sar(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_variable_shift( + CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int)) +{ + const Reg32 rd = CFGetRegD(cf); + if (!cf.const_s) + { + MoveSToReg(cg->ecx, cf); + MoveTToReg(rd, cf); + (cg->*op)(rd, cg->cl); + } + else + { + MoveTToReg(rd, cf); + (cg->*op_const)(rd, GetConstantRegU32(cf.MipsS())); + } +} + +void CPU::NewRec::X64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl); +} + +void CPU::NewRec::X64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr); +} + +void CPU::NewRec::X64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar); +} + +void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + // RAX/RDX shouldn't be allocatable.. + DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) && + !(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE)); + + MoveSToReg(cg->eax, cf); + if (cf.valid_host_t) + { + sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf)); + } + else if (cf.const_t) + { + cg->mov(cg->edx, GetConstantRegU32(cf.MipsT())); + sign ? cg->imul(cg->edx) : cg->mul(cg->edx); + } + else + { + sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT())); + } + + // TODO: skip writeback if it's not needed + if (cf.valid_host_lo) + cg->mov(CFGetRegLO(cf), cg->eax); + else + cg->mov(MipsPtr(Reg::lo), cg->eax); + if (cf.valid_host_lo) + cg->mov(CFGetRegHI(cf), cg->edx); + else + cg->mov(MipsPtr(Reg::hi), cg->edx); +} + +void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_div(CompileFlags cf) +{ + // not supported without registers for now.. + DebugAssert(cf.valid_host_lo && cf.valid_host_hi); + + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : cg->ecx; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const Reg32 rlo = CFGetRegLO(cf); + const Reg32 rhi = CFGetRegHI(cf); + + MoveSToReg(cg->eax, cf); + cg->cdq(); + + Label done; + Label not_divide_by_zero; + cg->test(rt, rt); + cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); + cg->test(cg->eax, cg->eax); + cg->mov(rhi, cg->eax); // hi = num + cg->mov(rlo, 1); + cg->mov(cg->eax, static_cast(-1)); + cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1 + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_divide_by_zero); + Label not_unrepresentable; + cg->cmp(cg->eax, 0x80000000u); + cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); + cg->cmp(rt, static_cast(-1)); + cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); + + cg->mov(rlo, 0x80000000u); + cg->xor_(rhi, rhi); + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_unrepresentable); + + cg->idiv(rt); + cg->mov(rlo, cg->eax); + cg->mov(rhi, cg->edx); + + cg->L(done); +} + +void CPU::NewRec::X64Compiler::Compile_divu(CompileFlags cf) +{ + // not supported without registers for now.. + DebugAssert(cf.valid_host_lo && cf.valid_host_hi); + + const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const Reg32 rlo = CFGetRegLO(cf); + const Reg32 rhi = CFGetRegHI(cf); + + MoveSToReg(cg->eax, cf); + cg->xor_(cg->edx, cg->edx); + + Label done; + Label not_divide_by_zero; + cg->test(rt, rt); + cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); + cg->mov(rlo, static_cast(-1)); + cg->mov(rhi, cg->eax); + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_divide_by_zero); + cg->div(rt); + cg->mov(rlo, cg->eax); + cg->mov(rhi, cg->edx); + + cg->L(done); +} + +void CPU::NewRec::X64Compiler::TestOverflow(const Xbyak::Reg32& result) +{ + SwitchToFarCode(true, &Xbyak::CodeGenerator::jo); + + BackupHostState(); + + // toss the result + ClearHostReg(result.getIdx()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::X64Compiler::Compile_dst_op( + CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, bool overflow) +{ + if (cf.valid_host_s && cf.valid_host_t) + { + if (cf.host_d == cf.host_s) + { + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + else if (cf.host_d == cf.host_t) + { + if (commutative) + { + (cg->*op)(CFGetRegD(cf), CFGetRegS(cf)); + } + else + { + cg->mov(RWARG1, CFGetRegT(cf)); + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), RWARG1); + } + } + else + { + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const Reg32 rd = CFGetRegD(cf); + (cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + (cg->*op_const)(CFGetRegD(cf), cv); + else + overflow = false; + } + else if (cf.const_s) + { + // need to backup T? 
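+    // Yes, when d aliases t: loading the constant s into d would clobber t, so t is copied to
+    // RWARG1 first. This matters for the non-commutative ops that reach this branch with a
+    // constant rs (e.g. sub/sltu), per the Compile_dst_op callers below.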
+ if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t) + { + cg->mov(RWARG1, CFGetRegT(cf)); + MoveSToReg(CFGetRegD(cf), cf); + (cg->*op)(CFGetRegD(cf), RWARG1); + } + else + { + MoveSToReg(CFGetRegD(cf), cf); + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + } + else if (cf.const_t) + { + MoveSToReg(CFGetRegD(cf), cf); + if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0) + (cg->*op_const)(CFGetRegD(cf), cv); + else + overflow = false; + } + else if (cf.valid_host_s) + { + if (cf.host_d != cf.host_s) + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); + } + else if (cf.valid_host_t) + { + if (cf.host_d != cf.host_t) + cg->mov(CFGetRegD(cf), CFGetRegT(cf)); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS())); + } + else + { + cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS())); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); + } + + if (overflow) + { + DebugAssert(cf.valid_host_d); + TestOverflow(CFGetRegD(cf)); + } +} + +void CPU::NewRec::X64Compiler::Compile_add(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::X64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_sub(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::X64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false); +} + +void CPU::NewRec::X64Compiler::Compile_and(CompileFlags cf) +{ + // special cases - and with self -> self, and with 0 -> 0 + const Reg32 regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + MoveSToReg(regd, cf); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + cg->xor_(regd, regd); + return; + } + + Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_or(CompileFlags cf) +{ + // or/nor with 0 -> no effect + const Reg32 regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_xor(CompileFlags cf) +{ + const Reg32 regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + cg->xor_(regd, regd); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + cg->not_(CFGetRegD(cf)); +} + +void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + const Reg32 rd = CFGetRegD(cf); + const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + // Case where D == S, can't use xor because it changes flags + // TODO: swap and reverse op for constants + if (rd != rs && rd != rt) + cg->xor_(rd, rd); + + if (cf.valid_host_t) + cg->cmp(rs, CFGetRegT(cf)); + else if (cf.const_t) + cg->cmp(rs, GetConstantRegU32(cf.MipsT())); + else + cg->cmp(rs, MipsPtr(cf.MipsT())); + + if (rd == rs || rd == rt) + cg->mov(rd, 0); + + sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8()); +} + +Xbyak::Reg32 +CPU::NewRec::X64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf, + const std::optional& address, + const std::optional& reg /* = std::nullopt */) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const Reg32 dst = reg.has_value() ? reg.value() : RWARG1; + if (address.has_value()) + { + cg->mov(dst, address.value()); + } + else + { + if (cf.valid_host_s) + { + if (const Reg32 src = CFGetRegS(cf); src != dst) + cg->mov(dst, CFGetRegS(cf)); + } + else + { + cg->mov(dst, MipsPtr(cf.MipsS())); + } + + if (imm != 0) + cg->add(dst, inst->i.imm_sext32()); + } + + return dst; +} + +template +Xbyak::Reg32 CPU::NewRec::X64Compiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (CodeCache::IsUsingFastmem() && !checked) + { + m_cycles += Bus::RAM_READ_TICKS; + + const Reg32 dst = dst_reg_alloc(); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg != RWARG3); + cg->mov(RWARG3, addr_reg.cvt32()); + cg->shr(RWARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); + cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); + } + + const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE; + u8* start = cg->getCurr(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) : + cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]); + } + break; + + case MemoryAccessSize::HalfWord: + { + sign ? cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) : + cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]); + } + break; + + case MemoryAccessSize::Word: + { + cg->mov(dst, cg->word[membase + addr_reg.cvt64()]); + } + break; + } + + u8* end = cg->getCurr(); + while ((end - start) < BACKPATCH_JMP_SIZE) + { + cg->nop(); + end = cg->getCurr(); + } + + AddLoadStoreInfo(start, static_cast(end - start), static_cast(addr_reg.getIdx()), + static_cast(dst.getIdx()), size, sign, true); + return dst; + } + + if (addr_reg != RWARG1) + cg->mov(RWARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(checked ? 
reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + cg->test(RXRET, RXRET); + + BackupHostState(); + SwitchToFarCode(true, &CodeGenerator::js); + + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + cg->mov(RWARG1, RWRET); + cg->neg(RWARG1); + cg->shl(RWARG1, 2); + cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(static_cast(&CPU::RaiseException)); + m_dirty_pc = false; + EndAndLinkBlock(std::nullopt, true); + + SwitchToNearCode(false); + RestoreHostState(); + } + + const Xbyak::Reg32 dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8()); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16()); + } + break; + case MemoryAccessSize::Word: + { + if (dst_reg != RWRET) + cg->mov(dst_reg, RWRET); + } + break; + } + + return dst_reg; +} + +void CPU::NewRec::X64Compiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg, + MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (CodeCache::IsUsingFastmem() && !checked) + { + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3); + cg->mov(RWARG3, addr_reg.cvt32()); + cg->shr(RWARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); + cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); + } + + const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE; + u8* start = cg->getCurr(); + switch (size) + { + case MemoryAccessSize::Byte: + cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8()); + break; + + case MemoryAccessSize::HalfWord: + cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16()); + break; + + case MemoryAccessSize::Word: + cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt32()); + break; + } + + u8* end = cg->getCurr(); + while ((end - start) < BACKPATCH_JMP_SIZE) + { + cg->nop(); + end = cg->getCurr(); + } + + AddLoadStoreInfo(start, static_cast(end - start), static_cast(addr_reg.getIdx()), + static_cast(value_reg.getIdx()), size, false, false); + return; + } + + if (addr_reg != RWARG1) + cg->mov(RWARG1, addr_reg); + if (value_reg != RWARG2) + cg->mov(RWARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(checked ? 
reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + cg->test(RWRET, RWRET); + + BackupHostState(); + SwitchToFarCode(true, &CodeGenerator::jnz); + + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + cg->mov(RWARG1, RWRET); + cg->shl(RWARG1, 2); + cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + EndAndLinkBlock(std::nullopt, true); + + SwitchToNearCode(false); + RestoreHostState(); + } +} + +void CPU::NewRec::X64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? + std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + + const Reg32 data = GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RWRET; + + return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT())); + }); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + + cg->mov(RWARG1, inst->bits); + cg->mov(RWARG2, addr); + cg->mov(RWARG3, data); + cg->call(s_pgxp_mem_load_functions[static_cast(size)][static_cast(sign)]); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.getIdx()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + Reg32 value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = Reg32(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (HasConstantReg(rt)) + cg->mov(value, GetConstantRegU32(rt)); + else if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + cg->mov(value, Reg32(rtreg.value())); + else + cg->mov(value, MipsPtr(rt)); + } + else + { + value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value != cg->ecx); + cg->mov(cg->ecx, addr); + cg->and_(cg->ecx, 3); + cg->shl(cg->ecx, 3); // *8 + + // TODO for other arch: reverse subtract + DebugAssert(RWARG2 != cg->ecx); + cg->mov(RWARG2, 24); + cg->sub(RWARG2, cg->ecx); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + cg->mov(addr, 0xFFFFFFu); + cg->shr(addr, cg->cl); + cg->and_(value, addr); + cg->mov(cg->ecx, RWARG2); + cg->shl(RWRET, cg->cl); + cg->or_(value, RWRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + cg->shr(RWRET, cg->cl); + cg->mov(addr, 0xFFFFFF00u); + cg->mov(cg->ecx, RWARG2); + cg->shl(addr, cg->cl); + cg->and_(value, addr); + cg->or_(value, RWRET); + } + + FreeHostReg(addr.getIdx()); +} + +void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? 
+ std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RWRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + break; + } + + case GTERegisterAccessAction::Direct: + { + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::SignExtend16: + { + cg->movsx(RWRET, RWRET.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + cg->movzx(RWRET, RWRET.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG2, RWRET); + cg->mov(RWARG1, index); + cg->call(>E::WriteRegister); + break; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET != RWARG1 && RWRET != RWARG2); + cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWRET); + break; + } + + default: + { + Panic("Unknown action"); + return; + } + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG3, RWRET); + cg->mov(RWARG2, addr); + cg->mov(RWARG1, inst->bits); + cg->call(reinterpret_cast(&PGXP::CPU_LWC2)); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? + std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, true); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const Reg32 data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(RWARG2, cf); + + GenerateStore(addr, data, size); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + MoveMIPSRegToReg(RWARG3, cf.MipsT()); + cg->mov(RWARG2, addr); + cg->mov(RWARG1, inst->bits); + cg->call(s_pgxp_mem_store_functions[static_cast(size)]); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. 
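+  // swl/swr are emulated as a read-modify-write of the aligned word: load the word at
+  // (addr & ~3u), merge the relevant bytes of rt into it according to (addr & 3), then store the
+  // merged word back to the same aligned address (see the mask comments below).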
+ const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const Reg32 value = RWARG2; + DebugAssert(value != cg->ecx); + if (HasConstantReg(rt)) + cg->mov(value, GetConstantRegU32(rt)); + else if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + cg->mov(value, Reg32(rtreg.value())); + else + cg->mov(value, MipsPtr(rt)); + + cg->mov(cg->ecx, addr); + cg->and_(cg->ecx, 3); + cg->shl(cg->ecx, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + cg->mov(RWARG3, 0xFFFFFF00u); + cg->shl(RWARG3, cg->cl); + cg->and_(RWRET, RWARG3); + + cg->mov(RWARG3, 24); + cg->sub(RWARG3, cg->ecx); + cg->mov(cg->ecx, RWARG3); + cg->shr(value, cg->cl); + cg->or_(value, RWRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + cg->shl(value, cg->cl); + + DebugAssert(RWARG3 != cg->ecx); + cg->mov(RWARG3, 24); + cg->sub(RWARG3, cg->ecx); + cg->mov(cg->ecx, RWARG3); + cg->mov(RWARG3, 0x00FFFFFFu); + cg->shr(RWARG3, cg->cl); + cg->and_(RWRET, RWARG3); + cg->or_(value, RWRET); + } + + FreeHostReg(addr.getIdx()); + + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateStore(RWARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG1, index); + cg->call(>E::ReadRegister); + cg->mov(RWARG2, RWRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + // PGXP makes this a giant pain. + if (!g_settings.gpu_pgxp_enable) + { + FlushForLoadStore(address, true); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RWARG2, size); + return; + } + + // TODO: This can be simplified because we don't need to validate in PGXP.. + const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); + const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); + FlushForLoadStore(address, true); + ComputeLoadStoreAddressArg(cf, address, addr_reg); + cg->mov(data_backup, RWARG2); + GenerateStore(addr_reg, RWARG2, size); + + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG3, data_backup); + cg->mov(RWARG2, addr_reg); + cg->mov(RWARG1, inst->bits); + cg->call(reinterpret_cast(&PGXP::CPU_SWC2)); + FreeHostReg(addr_reg.getIdx()); + FreeHostReg(data_backup.getIdx()); +} + +void CPU::NewRec::X64Compiler::Compile_mtc0(CompileFlags cf) +{ + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + // TODO: const apply mask + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : RWARG1; + const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0; + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const Reg32 changed_bits = RWARG3; + + // update value + if (cf.valid_host_t) + { + cg->mov(RWARG1, rt); + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + cg->and_(RWARG1, mask); + if (needs_bit_test) + { + cg->mov(changed_bits, RWARG2); + cg->xor_(changed_bits, RWARG1); + } + cg->and_(RWARG2, ~mask); + cg->or_(RWARG2, RWARG1); + cg->mov(cg->dword[PTR(ptr)], RWARG2); + } + else + { + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + if (needs_bit_test) + { + cg->mov(changed_bits, RWARG2); + cg->xor_(changed_bits, constant_value & mask); + } + cg->and_(RWARG2, ~mask); + cg->or_(RWARG2, constant_value & mask); + cg->mov(cg->dword[PTR(ptr)], RWARG2); + } + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + cg->test(changed_bits, 1u << 16); + SwitchToFarCode(true, &CodeGenerator::jnz); + cg->push(RWARG1); + cg->push(RWARG2); + cg->call(&CPU::UpdateMemoryPointers); + cg->pop(RWARG2); + cg->pop(RWARG1); + cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const Reg32 sr = + (reg == Cop0Reg::SR) ? RWARG2 : (cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]), RWARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::X64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + static constexpr u32 mode_bits_mask = UINT32_C(0b1111); + cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]); + cg->mov(RWARG2, RWARG1); + cg->shr(RWARG2, 2); + cg->and_(RWARG1, ~mode_bits_mask); + cg->and_(RWARG2, mode_bits_mask); + cg->or_(RWARG1, RWARG2); + cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1); + + TestInterrupts(RWARG1); +} + +void CPU::NewRec::X64Compiler::TestInterrupts(const Xbyak::Reg32& sr) +{ + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + + cg->test(sr, 1); + cg->jz(no_interrupt, CodeGenerator::T_NEAR); + + // sr & cause + cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + cg->test(sr, 0xFF00); + + SwitchToFarCode(true, &CodeGenerator::jnz); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + cg->call(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + cg->L(no_interrupt); +} + +void CPU::NewRec::X64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + u32 hreg; + if (action == GTERegisterAccessAction::Direct) + { + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? 
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+    cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
+  }
+  else if (action == GTERegisterAccessAction::CallHandler)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, index);
+    cg->call(&GTE::ReadRegister);
+
+    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
+                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+    cg->mov(Reg32(hreg), RWRET);
+  }
+  else
+  {
+    Panic("Unknown action");
+    return;
+  }
+
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, Reg32(hreg));
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
+  }
+}
+
+void CPU::NewRec::X64Compiler::Compile_mtc2(CompileFlags cf)
+{
+  const u32 index = inst->cop.Cop2Index();
+  const auto [ptr, action] = GetGTERegisterPointer(index, true);
+  if (action == GTERegisterAccessAction::Ignore)
+    return;
+
+  if (action == GTERegisterAccessAction::Direct)
+  {
+    if (cf.const_t)
+    {
+      cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
+    }
+    else if (cf.valid_host_t)
+    {
+      cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
+    }
+    else
+    {
+      cg->mov(RWARG1, MipsPtr(cf.MipsT()));
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+  }
+  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
+  {
+    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
+    if (cf.const_t)
+    {
+      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
+      cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
+    }
+    else if (cf.valid_host_t)
+    {
+      sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+    else
+    {
+      sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
+             cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+  }
+  else if (action == GTERegisterAccessAction::CallHandler)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, index);
+    MoveTToReg(RWARG2, cf);
+    cg->call(&GTE::WriteRegister);
+  }
+  else if (action == GTERegisterAccessAction::PushFIFO)
+  {
+    // SXY0 <- SXY1
+    // SXY1 <- SXY2
+    // SXY2 <- SXYP
+    cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
+    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
+    if (!cf.const_t && !cf.valid_host_t)
+      cg->mov(RWARG3, MipsPtr(cf.MipsT()));
+    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
+    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
+    if (cf.const_t)
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
+    else if (cf.valid_host_t)
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
+    else
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
+  }
+  else
+  {
+    Panic("Unknown action");
+  }
+}
+
+void CPU::NewRec::X64Compiler::Compile_cop2(CompileFlags cf)
+{
+  TickCount func_ticks;
+  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
+
+  Flush(FLUSH_FOR_C_CALL);
+  cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
+  cg->call(reinterpret_cast<const void*>(func));
+
+  AddGTETicks(func_ticks);
+}
+
+u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
+                                       TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
+                                       u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
+                                       bool is_load)
+{
+  CodeGenerator acg(thunk_space, thunk_code);
+  CodeGenerator* cg = &acg;
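// ---------------------------------------------------------------------------
// Standalone sketch (not part of the patch) of the stack-frame sizing the
// slow-path thunk uses below: each caller-saved GPR gets an 8-byte slot, the
// slot count is rounded up to an even number (presumably to keep RSP 16-byte
// aligned across the C call), and Win64 additionally reserves 32 bytes of
// shadow space. The helper name ThunkStackSize is invented here; the constants
// mirror the ones defined a few lines further down.
#include <cstdint>

namespace {
constexpr uint32_t GPR_SIZE = 8;
#ifdef _WIN32
constexpr uint32_t SHADOW_SIZE = 32;
#else
constexpr uint32_t SHADOW_SIZE = 0;
#endif

constexpr uint32_t ThunkStackSize(uint32_t num_saved_gprs)
{
  // (n + 1) & ~1 rounds n up to the next even count of 8-byte slots.
  return (((num_saved_gprs + 1) & ~1u) * GPR_SIZE) + SHADOW_SIZE;
}

// e.g. saving three caller-saved registers still allocates four slots.
static_assert(ThunkStackSize(3) == 4 * GPR_SIZE + SHADOW_SIZE);
} // namespace
// ---------------------------------------------------------------------------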
+ + static constexpr u32 GPR_SIZE = 8; + + // on win32, we need to reserve an additional 32 bytes shadow space when calling out to C +#ifdef _WIN32 + static constexpr u32 SHADOW_SIZE = 32; +#else + static constexpr u32 SHADOW_SIZE = 0; +#endif + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + SHADOW_SIZE; + + if (stack_size > 0) + { + cg->sub(cg->rsp, stack_size); + + u32 stack_offset = SHADOW_SIZE; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i)); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add); + + if (address_register != static_cast(RWARG1.getIdx())) + cg->mov(RWARG1, Reg32(address_register)); + + if (!is_load) + { + if (data_register != static_cast(RWARG2.getIdx())) + cg->mov(RWARG2, Reg32(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + if (is_load) + { + const Reg32 dst = Reg32(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8()); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? 
cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16()); + } + break; + case MemoryAccessSize::Word: + { + if (dst != RWRET) + cg->mov(dst, RWRET); + } + break; + } + } + + if (cycles_to_remove != 0) + cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove); + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = SHADOW_SIZE; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]); + stack_offset += GPR_SIZE; + } + } + + cg->add(cg->rsp, stack_size); + } + + cg->jmp(static_cast(code_address) + code_size); + + // fill the rest of it with nops, if any + DebugAssert(code_size >= BACKPATCH_JMP_SIZE); + if (code_size > BACKPATCH_JMP_SIZE) + std::memset(static_cast(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE); + + return static_cast(cg->getSize()); +} diff --git a/src/core/cpu_newrec_compiler_x64.h b/src/core/cpu_newrec_compiler_x64.h new file mode 100644 index 000000000..e9af43398 --- /dev/null +++ b/src/core/cpu_newrec_compiler_x64.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include +#include + +namespace CPU::NewRec { + +class X64Compiler final : public Compiler +{ +public: + X64Compiler(); + ~X64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) override; + void BeginBlock() override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const Xbyak::Reg32& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, + void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&), + void 
(Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const Xbyak::Reg32& result); + void Compile_dst_op(CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, + bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + Xbyak::Reg32 ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + Xbyak::Reg32 GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc); + void GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg, MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const Xbyak::Reg32& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*) = nullptr); + void SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*) = nullptr); + + Xbyak::Address MipsPtr(Reg r) const; + Xbyak::Reg32 CFGetRegD(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegS(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegT(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegLO(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegHI(CompileFlags cf) const; + + Xbyak::Reg32 MoveSToD(CompileFlags cf); + Xbyak::Reg32 MoveSToT(CompileFlags cf); + Xbyak::Reg32 MoveTToD(CompileFlags cf); + void MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf); + void MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf); + void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg); + + std::unique_ptr m_emitter; + 
std::unique_ptr m_far_emitter; + Xbyak::CodeGenerator* cg; +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index b5114237a..69212680f 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -46,7 +46,8 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) constexpr u32 stack_size = 8; #endif - DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); + DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler || + g_settings.cpu_execution_mode == CPUExecutionMode::NewRec); CodeGenerator acg(code_size, static_cast(code)); CodeGenerator* cg = &acg; diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index 58273691c..1b89317a8 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -6,6 +6,8 @@ #pragma once #include "cpu_types.h" +#include + #if defined(CPU_ARCH_X64) // We need to include windows.h before xbyak does.. @@ -130,4 +132,39 @@ u8* armGetJumpTrampoline(const void* target); } // namespace CPU::Recompiler +#elif defined(CPU_ARCH_RISCV64) + +#include "biscuit/assembler.hpp" + +namespace CPU::Recompiler { + +// A reasonable "maximum" number of bytes per instruction. +constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64; +constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128; + +#define RRET biscuit::a0 +#define RARG1 biscuit::a0 +#define RARG2 biscuit::a1 +#define RARG3 biscuit::a2 +#define RSCRATCH biscuit::t6 +#define RSTATE biscuit::s10 +#define RMEMBASE biscuit::s11 + +bool rvIsCallerSavedRegister(u32 id); +bool rvIsValidSExtITypeImm(u32 imm); +std::pair rvGetAddressImmediates(const void* cur, const void* target); +void rvMoveAddressToReg(biscuit::Assembler* armAsm, const biscuit::GPR& reg, const void* addr); +void rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm); +void rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch, u64 imm); +u32 rvEmitJmp(biscuit::Assembler* armAsm, const void* ptr, const biscuit::GPR& link_reg = biscuit::zero); +u32 rvEmitCall(biscuit::Assembler* armAsm, const void* ptr); +void rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitUExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitDSExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> doubleword +void rvEmitDUExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> doubleword + +} // namespace CPU::Recompiler + #endif diff --git a/src/core/imgui_overlays.cpp b/src/core/imgui_overlays.cpp index a52b9a5c5..050d100af 100644 --- a/src/core/imgui_overlays.cpp +++ b/src/core/imgui_overlays.cpp @@ -370,6 +370,11 @@ void ImGuiManager::DrawPerformanceOverlay() text.append_fmt("{}{}", first ? "" : "/", "CI"); first = false; } + else if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + { + text.append_fmt("{}{}", first ? 
"" : "/", "NR"); + first = false; + } else { if (g_settings.cpu_recompiler_icache) diff --git a/src/core/settings.cpp b/src/core/settings.cpp index 100824d08..99575e6f1 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -834,11 +834,13 @@ const char* Settings::GetDiscRegionDisplayName(DiscRegion region) return Host::TranslateToCString("DiscRegion", s_disc_region_display_names[static_cast(region)]); } -static constexpr const std::array s_cpu_execution_mode_names = {"Interpreter", "CachedInterpreter", "Recompiler"}; +static constexpr const std::array s_cpu_execution_mode_names = {"Interpreter", "CachedInterpreter", "Recompiler", + "NewRec"}; static constexpr const std::array s_cpu_execution_mode_display_names = { TRANSLATE_NOOP("CPUExecutionMode", "Interpreter (Slowest)"), TRANSLATE_NOOP("CPUExecutionMode", "Cached Interpreter (Faster)"), - TRANSLATE_NOOP("CPUExecutionMode", "Recompiler (Fastest)")}; + TRANSLATE_NOOP("CPUExecutionMode", "Recompiler (Fastest)"), + TRANSLATE_NOOP("CPUExecutionMode", "New Recompiler (Experimental)")}; std::optional Settings::ParseCPUExecutionMode(const char* str) { diff --git a/src/core/settings.h b/src/core/settings.h index 7fc54a3fb..b7d3c78f6 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -414,7 +414,7 @@ struct Settings static constexpr float DEFAULT_GPU_PGXP_DEPTH_THRESHOLD = 300.0f; static constexpr float GPU_PGXP_DEPTH_THRESHOLD_SCALE = 4096.0f; -#ifdef ENABLE_RECOMPILER +#if defined(ENABLE_RECOMPILER) static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::Recompiler; // LUT still ends up faster on Apple Silicon for now, because of 16K pages. @@ -423,6 +423,9 @@ struct Settings #else static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::LUT; #endif +#elif defined(ENABLE_NEWREC) + static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::NewRec; + static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::MMap; #else static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::CachedInterpreter; static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::Disabled; diff --git a/src/core/system.cpp b/src/core/system.cpp index 4fd0736cf..44a5e77a1 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -3532,7 +3532,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings) CPU::ClearICache(); } - if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler && + if (CPU::CodeCache::IsUsingAnyRecompiler() && (g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions || g_settings.cpu_recompiler_block_linking != old_settings.cpu_recompiler_block_linking || g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache || diff --git a/src/core/types.h b/src/core/types.h index 1732263b4..506784e11 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -46,6 +46,7 @@ enum class CPUExecutionMode : u8 Interpreter, CachedInterpreter, Recompiler, + NewRec, Count };