// SPDX-FileCopyrightText: 2023 Connor McLaughlin
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#include "cpu_newrec_compiler_x64.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/string_util.h"

#include "cpu_code_cache_private.h"
#include "cpu_core_private.h"
#include "cpu_recompiler_thunks.h"
#include "cpu_recompiler_types.h"
#include "gte.h"
#include "pgxp.h"
#include "settings.h"
#include "timing_event.h"

#include <algorithm>
#include <array>
#include <cstring>
#include <optional>
#include <utility>

Log_SetChannel(CPU::NewRec);

#define RMEMBASE cg->rbx
#define RSTATE cg->rbp

// #define PTR(x) (cg->rip + (x))
#define PTR(x) (RSTATE + (u32)(((u8*)(x)) - ((u8*)&g_state)))

// PGXP TODO: LWL etc, MFC0
// PGXP TODO: Spyro 1 level gates have issues.

static constexpr u32 BACKPATCH_JMP_SIZE = 5;

using namespace Xbyak;

using CPU::Recompiler::IsCallerSavedRegister;

// TODO: try using a pointer to state instead of rip-relative.. it might end up faster due to smaller code

namespace CPU::NewRec {

X64Compiler s_instance;
Compiler* g_compiler = &s_instance;

} // namespace CPU::NewRec

CPU::NewRec::X64Compiler::X64Compiler() = default;

CPU::NewRec::X64Compiler::~X64Compiler() = default;

void CPU::NewRec::X64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
                                     u8* far_code_buffer, u32 far_code_space)
{
  Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);

  // TODO: don't recreate this every time..
  DebugAssert(!m_emitter && !m_far_emitter && !cg);
  m_emitter = std::make_unique<Xbyak::CodeGenerator>(code_buffer_space, code_buffer);
  m_far_emitter = std::make_unique<Xbyak::CodeGenerator>(far_code_space, far_code_buffer);
  cg = m_emitter.get();

  // Need to wipe it out so it's correct when toggling fastmem.
  m_host_regs = {};

  const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast<u32>(RMEMBASE.getIdx()) : NUM_HOST_REGS;
  const u32 cpu_idx = static_cast<u32>(RSTATE.getIdx());
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    HostRegAlloc& ra = m_host_regs[i];

    if (i == static_cast<u32>(RWRET.getIdx()) || i == static_cast<u32>(RWARG1.getIdx()) ||
        i == static_cast<u32>(RWARG2.getIdx()) || i == static_cast<u32>(RWARG3.getIdx()) ||
        i == static_cast<u32>(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx ||
        i == static_cast<u32>(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */)
    {
      continue;
    }

    ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
  }
}
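// Two emitters are in play: the near buffer holds the hot path, while rarely-taken paths
// (exceptions, interrupt dispatch, memory pointer updates) are emitted to the far buffer so
// they stay out of the way of straight-line code. SwitchToFarCode()/SwitchToNearCode()
// optionally emit the (conditional) jump that transfers control between the two streams.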
void CPU::NewRec::X64Compiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
{
  DebugAssert(cg == m_emitter.get());
  if (emit_jump)
  {
    const void* fcptr = m_far_emitter->getCurr();
    (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
  }
  cg = m_far_emitter.get();
}

void CPU::NewRec::X64Compiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*))
{
  DebugAssert(cg == m_far_emitter.get());
  if (emit_jump)
  {
    const void* fcptr = m_emitter->getCurr();
    (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr);
  }
  cg = m_emitter.get();
}

void CPU::NewRec::X64Compiler::BeginBlock()
{
  Compiler::BeginBlock();
#if 0
  if (m_block->pc == 0xBFC06F0C)
  {
    //__debugbreak();
    cg->db(0xcc);
  }
#endif

#if 0
  cg->nop();
  cg->mov(RWARG1, m_block->pc);
  cg->nop();
#endif
}

void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
  // store it first to reduce code size, because we can offset
  cg->mov(RXARG1, static_cast<size_t>(reinterpret_cast<uintptr_t>(ram_ptr)));
  cg->mov(RXARG2, static_cast<size_t>(reinterpret_cast<uintptr_t>(shadow_ptr)));

  bool first = true;
  u32 offset = 0;

  // Compare in 16-byte chunks: pcmpeqd sets each lane to all-ones where RAM matches the
  // shadow copy, and the results are ANDed together so a single movmskps at the end can
  // check the whole block.
  while (size >= 16)
  {
    const Xbyak::Xmm& dst = first ? cg->xmm0 : cg->xmm1;
    cg->movups(dst, cg->xword[RXARG1 + offset]);
    cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]);
    if (!first)
      cg->pand(cg->xmm0, dst);
    else
      first = false;

    offset += 16;
    size -= 16;
  }

  // TODO: better codegen for 16 byte aligned blocks
  if (!first)
  {
    cg->movmskps(cg->eax, cg->xmm0);
    cg->cmp(cg->eax, 0xf);
    cg->jne(CodeCache::g_discard_and_recompile_block);
  }

  while (size >= 8)
  {
    cg->mov(RXARG3, cg->qword[RXARG1 + offset]);
    cg->cmp(RXARG3, cg->qword[RXARG2 + offset]);
    cg->jne(CodeCache::g_discard_and_recompile_block);
    offset += 8;
    size -= 8;
  }

  while (size >= 4)
  {
    cg->mov(RWARG3, cg->dword[RXARG1 + offset]);
    cg->cmp(RWARG3, cg->dword[RXARG2 + offset]);
    cg->jne(CodeCache::g_discard_and_recompile_block);
    offset += 4;
    size -= 4;
  }

  DebugAssert(size == 0);
}

void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate()
{
  if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1)
  {
    cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(m_block->uncached_fetch_ticks));
  }
  else if (m_block->icache_line_count > 0)
  {
    cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]);

    // TODO: Vectorize this...
    VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    {
      const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc);
      const TickCount fill_ticks = GetICacheFillTicks(current_pc);
      if (fill_ticks <= 0)
        continue;

      const u32 line = GetICacheLine(current_pc);
      const u32 offset = (line * sizeof(u32));

      Xbyak::Label cache_hit;
      cg->cmp(cg->dword[RXARG1 + offset], tag);
      cg->je(cache_hit);
      cg->mov(cg->dword[RXARG1 + offset], tag);
      cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast<u32>(fill_ticks));
      cg->L(cache_hit);
    }
  }
}

void CPU::NewRec::X64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
                                            s32 arg3reg /*= -1*/)
{
  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.getIdx()))
    cg->mov(RXARG1, Reg64(arg1reg));
  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.getIdx()))
    cg->mov(RXARG2, Reg64(arg2reg));
  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.getIdx()))
    cg->mov(RXARG3, Reg64(arg3reg));
  cg->call(func);
}

void CPU::NewRec::X64Compiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
{
  if (newpc.has_value())
  {
    if (m_dirty_pc || m_compiler_pc != newpc)
      cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value());
  }
  m_dirty_pc = false;

  // flush regs
  Flush(FLUSH_END_BLOCK);

  EndAndLinkBlock(newpc, do_event_test, false);
}
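// Exceptional exit: everything is flushed and CPU::RaiseException() is called. The CAUSE
// value is built at compile time, since the exception code and branch-delay flag are
// statically known for the faulting instruction.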
void CPU::NewRec::X64Compiler::EndBlockWithException(Exception excode)
{
  // flush regs, but not pc, it's going to get overwritten
  // flush cycles because of the GTE instruction stuff...
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // TODO: flush load delay
  // TODO: break for pcdrv

  cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
                                                              inst->cop.cop_n));
  cg->mov(RWARG2, m_current_instruction_pc);
  cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
  m_dirty_pc = false;

  EndAndLinkBlock(std::nullopt, true, false);
}
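// Common epilogue: adds the cycles accumulated while compiling the block to pending_ticks,
// optionally compares against downcount to run the event test, then jumps to the
// dispatcher, links directly to the next block, or loops back to itself.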
void CPU::NewRec::X64Compiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test,
                                               bool force_run_events)
{
  // event test
  // pc should've been flushed
  DebugAssert(!m_dirty_pc && !m_block_ended);
  m_block_ended = true;

  // TODO: try extracting this to a function

  // save cycles for event test
  const TickCount cycles = std::exchange(m_cycles, 0);

  // fast path when not doing an event test
  if (!do_event_test && m_gte_done_cycle <= cycles)
  {
    if (cycles == 1)
      cg->inc(cg->dword[PTR(&g_state.pending_ticks)]);
    else if (cycles > 0)
      cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles);

    if (force_run_events)
    {
      cg->jmp(CodeCache::g_run_events_and_dispatch);
      return;
    }
  }
  else
  {
    // pending_ticks += cycles
    // if (pending_ticks >= downcount) { dispatch_event(); }
    if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles)
      cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
    if (cycles > 0)
      cg->add(RWARG1, cycles);
    if (m_gte_done_cycle > cycles)
    {
      cg->mov(RWARG2, RWARG1);
      ((m_gte_done_cycle - cycles) == 1) ? cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles);
      cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2);
    }
    if (do_event_test)
      cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
    if (cycles > 0)
      cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
    if (do_event_test)
      cg->jge(CodeCache::g_run_events_and_dispatch);
  }

  // jump to dispatcher or next block
  if (!newpc.has_value())
  {
    cg->jmp(CodeCache::g_dispatcher);
  }
  else
  {
    if (newpc.value() == m_block->pc)
    {
      // Special case: ourselves! No need to backlink then.
      Log_DebugPrintf("Linking block at %08X to self", m_block->pc);
      cg->jmp(cg->getCode());
    }
    else
    {
      const void* target = CodeCache::CreateBlockLink(m_block, cg->getCurr<void*>(), newpc.value());
      cg->jmp(target, CodeGenerator::T_NEAR);
    }
  }
}

const void* CPU::NewRec::X64Compiler::EndCompile(u32* code_size, u32* far_code_size)
{
  const void* code = m_emitter->getCode();
  *code_size = static_cast<u32>(m_emitter->getSize());
  *far_code_size = static_cast<u32>(m_far_emitter->getSize());
  cg = nullptr;
  m_far_emitter.reset();
  m_emitter.reset();
  return code;
}

const void* CPU::NewRec::X64Compiler::GetCurrentCodePointer()
{
  return cg->getCurr();
}

const char* CPU::NewRec::X64Compiler::GetHostRegName(u32 reg) const
{
  static constexpr std::array<const char*, 16> reg64_names = {
    {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}};
  return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
}

void CPU::NewRec::X64Compiler::LoadHostRegWithConstant(u32 reg, u32 val)
{
  cg->mov(Reg32(reg), val);
}

void CPU::NewRec::X64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
{
  cg->mov(Reg32(reg), cg->dword[PTR(ptr)]);
}

void CPU::NewRec::X64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
{
  cg->mov(cg->dword[PTR(ptr)], Reg32(reg));
}

void CPU::NewRec::X64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
{
  cg->mov(cg->dword[PTR(ptr)], val);
}

void CPU::NewRec::X64Compiler::CopyHostReg(u32 dst, u32 src)
{
  if (src != dst)
    cg->mov(Reg32(dst), Reg32(src));
}

Xbyak::Address CPU::NewRec::X64Compiler::MipsPtr(Reg r) const
{
  DebugAssert(r < Reg::count);
  return cg->dword[PTR(&g_state.regs.r[static_cast<u32>(r)])];
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegD(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_d);
  return Reg32(cf.host_d);
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s);
  return Reg32(cf.host_s);
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t);
  return Reg32(cf.host_t);
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegLO(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_lo);
  return Reg32(cf.host_lo);
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegHI(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_hi);
  return Reg32(cf.host_hi);
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToD(CompileFlags cf)
{
  DebugAssert(cf.valid_host_d);
  DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d);

  const Reg32 rd = CFGetRegD(cf);
  MoveSToReg(rd, cf);
  return rd;
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToT(CompileFlags cf)
{
  DebugAssert(cf.valid_host_t);

  const Reg32 rt = CFGetRegT(cf);
  if (cf.valid_host_s)
  {
    const Reg32 rs = CFGetRegS(cf);
    if (rt != rs)
      cg->mov(rt, rs);
  }
  else if (cf.const_s)
  {
    if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0)
      cg->mov(rt, cv);
    else
      cg->xor_(rt, rt);
  }
  else
  {
    cg->mov(rt, MipsPtr(cf.MipsS()));
  }

  return rt;
}

Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveTToD(CompileFlags cf)
{
  DebugAssert(cf.valid_host_d);
  DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d);

  const Reg32 rd = CFGetRegD(cf);
  MoveTToReg(rd, cf);
  return rd;
}

void CPU::NewRec::X64Compiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf)
{
  if (cf.valid_host_s)
  {
    if (cf.host_s != static_cast<u32>(dst.getIdx()))
      cg->mov(dst, Reg32(cf.host_s));
  }
  else if (cf.const_s)
  {
    const u32 cv = GetConstantRegU32(cf.MipsS());
    if (cv == 0)
      cg->xor_(dst, dst);
    else
      cg->mov(dst, cv);
  }
  else
  {
    cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]);
  }
}

void CPU::NewRec::X64Compiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
{
  if (cf.valid_host_t)
  {
    if (cf.host_t != static_cast<u32>(dst.getIdx()))
      cg->mov(dst, Reg32(cf.host_t));
  }
  else if (cf.const_t)
  {
    const u32 cv = GetConstantRegU32(cf.MipsT());
    if (cv == 0)
      cg->xor_(dst, dst);
    else
      cg->mov(dst, cv);
  }
  else
  {
    cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]);
  }
}

void CPU::NewRec::X64Compiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
{
  DebugAssert(reg < Reg::count);
  if (const std::optional<u32> hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg))
    cg->mov(dst, Reg32(hreg.value()));
  else if (HasConstantReg(reg))
    cg->mov(dst, GetConstantRegU32(reg));
  else
    cg->mov(dst, MipsPtr(reg));
}
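// Helper for PGXP callbacks taking the instruction word and the values of up to two MIPS
// registers: flushes for the C call, materializes the register values into the argument
// registers, and invokes the handler.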
void CPU::NewRec::X64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val,
                                                            Reg arg2reg /* = Reg::count */,
                                                            Reg arg3reg /* = Reg::count */)
{
  DebugAssert(g_settings.gpu_pgxp_enable);

  Flush(FLUSH_FOR_C_CALL);

  if (arg2reg != Reg::count)
    MoveMIPSRegToReg(RWARG2, arg2reg);
  if (arg3reg != Reg::count)
    MoveMIPSRegToReg(RWARG3, arg3reg);

  cg->mov(RWARG1, arg1val);
  cg->call(func);
}

void CPU::NewRec::X64Compiler::Flush(u32 flags)
{
  Compiler::Flush(flags);

  if (flags & FLUSH_PC && m_dirty_pc)
  {
    cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
    m_dirty_pc = false;
  }

  if (flags & FLUSH_INSTRUCTION_BITS)
  {
    cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits);
    cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc);
    cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot);
  }

  if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
  {
    // This sucks :(
    // TODO: make it a function?
    cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]);
    cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]);
    cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2);
    cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(Reg::count));
    m_load_delay_dirty = false;
  }

  if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
  {
    if (m_load_delay_value_register != NUM_HOST_REGS)
      FreeHostReg(m_load_delay_value_register);

    cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast<u8>(m_load_delay_register));
    m_load_delay_register = Reg::count;
    m_load_delay_dirty = true;
  }

  if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
  {
    // May as well flush cycles while we're here.
    // GTE spanning blocks is very rare, we _could_ disable this for speed.
    cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]);
    if (m_cycles > 0)
    {
      (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
      m_cycles = 0;
    }
    cg->cmp(RWARG2, RWARG1);
    cg->cmova(RWARG1, RWARG2);
    cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
    m_dirty_gte_done_cycle = false;
  }

  if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
  {
    cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);

    // update cycles at the same time
    if (flags & FLUSH_CYCLES && m_cycles > 0)
    {
      (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles);
      cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1);
      m_gte_done_cycle -= m_cycles;
      m_cycles = 0;
    }

    (m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle);
    cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1);
    m_gte_done_cycle = 0;
    m_dirty_gte_done_cycle = true;
  }

  if (flags & FLUSH_CYCLES && m_cycles > 0)
  {
    (m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) :
                      cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles);
    m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0);
    m_cycles = 0;
  }
}
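// Interpreter fallback for instructions without a dedicated recompiler implementation.
// After the thunk executes, any interpreter-side next_load_delay is promoted to load_delay
// so the delayed write remains visible to the compiled code that follows.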
void CPU::NewRec::X64Compiler::Compile_Fallback()
{
  Flush(FLUSH_FOR_INTERPRETER);

  cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);

  // TODO: make me less garbage
  // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
  // but nothing should be going through here..
  Label no_load_delay;
  cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
  cg->cmp(RWARG1, static_cast<u8>(Reg::count));
  cg->je(no_load_delay, CodeGenerator::T_SHORT);
  cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
  cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1.cvt8());
  cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
  cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u8>(Reg::count));
  cg->L(no_load_delay);

  m_load_delay_dirty = EMULATE_LOAD_DELAYS;
}

void CPU::NewRec::X64Compiler::CheckBranchTarget(const Xbyak::Reg32& pcreg)
{
  if (!g_settings.cpu_recompiler_memory_exceptions)
    return;

  cg->test(pcreg, 0x3);
  SwitchToFarCode(true, &CodeGenerator::jnz);

  BackupHostState();
  EndBlockWithException(Exception::AdEL);
  RestoreHostState();

  SwitchToNearCode(false);
}

void CPU::NewRec::X64Compiler::Compile_jr(CompileFlags cf)
{
  if (!cf.valid_host_s)
    cg->mov(RWARG1, MipsPtr(cf.MipsS()));

  const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  CheckBranchTarget(pcreg);

  cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::NewRec::X64Compiler::Compile_jalr(CompileFlags cf)
{
  if (!cf.valid_host_s)
    cg->mov(RWARG1, MipsPtr(cf.MipsS()));

  const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;

  if (MipsD() != Reg::zero)
    SetConstantReg(MipsD(), GetBranchReturnAddress(cf));

  CheckBranchTarget(pcreg);
  cg->mov(cg->dword[PTR(&g_state.pc)], pcreg);

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}
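// Conditional branches compile to two exits: fall-through and taken. Host state is backed
// up before the first exit and restored for the second, so both exits observe the same
// register allocation, and each ends the block with its own target PC.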
void CPU::NewRec::X64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
{
  const u32 taken_pc = GetConditionalBranchTarget(cf);

  Flush(FLUSH_FOR_BRANCH);

  DebugAssert(cf.valid_host_s);

  // MipsT() here should equal zero for zero branches.
  DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);

  // TODO: Swap this back to near once instructions don't blow up
  constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR;
  Label taken;
  switch (cond)
  {
    case BranchCondition::Equal:
    case BranchCondition::NotEqual:
    {
      // we should always have S, maybe not T
      // TODO: if it's zero, we can just do test rs, rs
      if (cf.valid_host_t)
        cg->cmp(CFGetRegS(cf), CFGetRegT(cf));
      else if (cf.const_t)
        cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT()));
      else
        cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT()));

      (cond == BranchCondition::Equal) ? cg->je(taken, type) : cg->jne(taken, type);
    }
    break;

    case BranchCondition::GreaterThanZero:
    {
      cg->cmp(CFGetRegS(cf), 0);
      cg->jg(taken, type);
    }
    break;

    case BranchCondition::GreaterEqualZero:
    {
      cg->test(CFGetRegS(cf), CFGetRegS(cf));
      cg->jns(taken, type);
    }
    break;

    case BranchCondition::LessThanZero:
    {
      cg->test(CFGetRegS(cf), CFGetRegS(cf));
      cg->js(taken, type);
    }
    break;

    case BranchCondition::LessEqualZero:
    {
      cg->cmp(CFGetRegS(cf), 0);
      cg->jle(taken, type);
    }
    break;
  }

  BackupHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(m_compiler_pc, true);

  cg->L(taken);

  RestoreHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(taken_pc, true);
}

void CPU::NewRec::X64Compiler::Compile_addi(CompileFlags cf)
{
  const Reg32 rt = MoveSToT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
  {
    cg->add(rt, imm);
    if (g_settings.cpu_recompiler_memory_exceptions)
    {
      DebugAssert(cf.valid_host_t);
      TestOverflow(rt);
    }
  }
}

void CPU::NewRec::X64Compiler::Compile_addiu(CompileFlags cf)
{
  const Reg32 rt = MoveSToT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
    cg->add(rt, imm);
}

void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf)
{
  Compile_slti(cf, true);
}

void CPU::NewRec::X64Compiler::Compile_sltiu(CompileFlags cf)
{
  Compile_slti(cf, false);
}
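// setcc only writes the low byte, so the destination is zeroed first with xor. When T
// aliases S that xor would destroy the value being compared (and xor itself clobbers
// flags), so in that case a flags-preserving mov of zero is issued after the cmp instead.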
void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf, bool sign)
{
  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;

  // Case where T == S, can't use xor because it changes flags
  if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s)
    cg->xor_(rt, rt);

  if (cf.valid_host_s)
    cg->cmp(CFGetRegS(cf), inst->i.imm_sext32());
  else
    cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32());

  if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s)
    cg->mov(rt, 0);

  sign ? cg->setl(rt.cvt8()) : cg->setb(rt.cvt8());

  if (!cf.valid_host_t)
    cg->mov(MipsPtr(cf.MipsT()), rt);
}

void CPU::NewRec::X64Compiler::Compile_andi(CompileFlags cf)
{
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
  {
    const Reg32 rt = MoveSToT(cf);
    cg->and_(rt, imm);
  }
  else
  {
    const Reg32 rt = CFGetRegT(cf);
    cg->xor_(rt, rt);
  }
}

void CPU::NewRec::X64Compiler::Compile_ori(CompileFlags cf)
{
  const Reg32 rt = MoveSToT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    cg->or_(rt, imm);
}

void CPU::NewRec::X64Compiler::Compile_xori(CompileFlags cf)
{
  const Reg32 rt = MoveSToT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    cg->xor_(rt, imm);
}

void CPU::NewRec::X64Compiler::Compile_sll(CompileFlags cf)
{
  const Reg32 rd = MoveTToD(cf);
  if (inst->r.shamt > 0)
    cg->shl(rd, inst->r.shamt);
}

void CPU::NewRec::X64Compiler::Compile_srl(CompileFlags cf)
{
  const Reg32 rd = MoveTToD(cf);
  if (inst->r.shamt > 0)
    cg->shr(rd, inst->r.shamt);
}

void CPU::NewRec::X64Compiler::Compile_sra(CompileFlags cf)
{
  const Reg32 rd = MoveTToD(cf);
  if (inst->r.shamt > 0)
    cg->sar(rd, inst->r.shamt);
}

void CPU::NewRec::X64Compiler::Compile_variable_shift(
  CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&),
  void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int))
{
  const Reg32 rd = CFGetRegD(cf);
  if (!cf.const_s)
  {
    MoveSToReg(cg->ecx, cf);
    MoveTToReg(rd, cf);
    (cg->*op)(rd, cg->cl);
  }
  else
  {
    MoveTToReg(rd, cf);
    (cg->*op_const)(rd, GetConstantRegU32(cf.MipsS()));
  }
}

void CPU::NewRec::X64Compiler::Compile_sllv(CompileFlags cf)
{
  Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl);
}

void CPU::NewRec::X64Compiler::Compile_srlv(CompileFlags cf)
{
  Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr);
}

void CPU::NewRec::X64Compiler::Compile_srav(CompileFlags cf)
{
  Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar);
}

void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf, bool sign)
{
  // RAX/RDX shouldn't be allocatable..
  DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) &&
              !(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE));

  MoveSToReg(cg->eax, cf);
  if (cf.valid_host_t)
  {
    sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    cg->mov(cg->edx, GetConstantRegU32(cf.MipsT()));
    sign ? cg->imul(cg->edx) : cg->mul(cg->edx);
  }
  else
  {
    sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT()));
  }

  // TODO: skip writeback if it's not needed
  if (cf.valid_host_lo)
    cg->mov(CFGetRegLO(cf), cg->eax);
  else
    cg->mov(MipsPtr(Reg::lo), cg->eax);
  if (cf.valid_host_hi)
    cg->mov(CFGetRegHI(cf), cg->edx);
  else
    cg->mov(MipsPtr(Reg::hi), cg->edx);
}

void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf)
{
  Compile_mult(cf, true);
}

void CPU::NewRec::X64Compiler::Compile_multu(CompileFlags cf)
{
  Compile_mult(cf, false);
}
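// MIPS division never traps, so the R3000A edge cases are handled inline around idiv:
// division by zero yields lo = (s >= 0 ? -1 : 1) with hi = s, and the one case that would
// fault in idiv (0x80000000 / -1) yields lo = 0x80000000 with hi = 0.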
void CPU::NewRec::X64Compiler::Compile_div(CompileFlags cf)
{
  // not supported without registers for now..
  DebugAssert(cf.valid_host_lo && cf.valid_host_hi);

  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Reg32 rlo = CFGetRegLO(cf);
  const Reg32 rhi = CFGetRegHI(cf);

  MoveSToReg(cg->eax, cf);
  cg->cdq();

  Label done;
  Label not_divide_by_zero;
  cg->test(rt, rt);
  cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
  cg->test(cg->eax, cg->eax);
  cg->mov(rhi, cg->eax); // hi = num
  cg->mov(rlo, 1);
  cg->mov(cg->eax, static_cast<u32>(-1));
  cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1
  cg->jmp(done, CodeGenerator::T_SHORT);

  cg->L(not_divide_by_zero);
  Label not_unrepresentable;
  cg->cmp(cg->eax, 0x80000000u);
  cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);
  cg->cmp(rt, static_cast<u32>(-1));
  cg->jne(not_unrepresentable, CodeGenerator::T_SHORT);

  cg->mov(rlo, 0x80000000u);
  cg->xor_(rhi, rhi);
  cg->jmp(done, CodeGenerator::T_SHORT);

  cg->L(not_unrepresentable);

  cg->idiv(rt);
  cg->mov(rlo, cg->eax);
  cg->mov(rhi, cg->edx);

  cg->L(done);
}

void CPU::NewRec::X64Compiler::Compile_divu(CompileFlags cf)
{
  // not supported without registers for now..
  DebugAssert(cf.valid_host_lo && cf.valid_host_hi);

  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Reg32 rlo = CFGetRegLO(cf);
  const Reg32 rhi = CFGetRegHI(cf);

  MoveSToReg(cg->eax, cf);
  cg->xor_(cg->edx, cg->edx);

  Label done;
  Label not_divide_by_zero;
  cg->test(rt, rt);
  cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT);
  cg->mov(rlo, static_cast<u32>(-1));
  cg->mov(rhi, cg->eax);
  cg->jmp(done, CodeGenerator::T_SHORT);

  cg->L(not_divide_by_zero);

  cg->div(rt);
  cg->mov(rlo, cg->eax);
  cg->mov(rhi, cg->edx);

  cg->L(done);
}

void CPU::NewRec::X64Compiler::TestOverflow(const Xbyak::Reg32& result)
{
  SwitchToFarCode(true, &Xbyak::CodeGenerator::jo);

  BackupHostState();

  // toss the result
  ClearHostReg(result.getIdx());

  EndBlockWithException(Exception::Ov);

  RestoreHostState();

  SwitchToNearCode(false);
}
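// Maps three-operand MIPS ALU ops onto two-operand x86: if D aliases S the op applies in
// place; if D aliases T that only works directly for commutative ops (otherwise T is saved
// to a scratch register first); known-constant operands fold into immediates.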
void CPU::NewRec::X64Compiler::Compile_dst_op(CompileFlags cf,
                                              void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&,
                                                                               const Xbyak::Operand&),
                                              void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32),
                                              bool commutative, bool overflow)
{
  if (cf.valid_host_s && cf.valid_host_t)
  {
    if (cf.host_d == cf.host_s)
    {
      (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
    }
    else if (cf.host_d == cf.host_t)
    {
      if (commutative)
      {
        (cg->*op)(CFGetRegD(cf), CFGetRegS(cf));
      }
      else
      {
        cg->mov(RWARG1, CFGetRegT(cf));
        cg->mov(CFGetRegD(cf), CFGetRegS(cf));
        (cg->*op)(CFGetRegD(cf), RWARG1);
      }
    }
    else
    {
      cg->mov(CFGetRegD(cf), CFGetRegS(cf));
      (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
    }
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const Reg32 rd = CFGetRegD(cf);
    (cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
      (cg->*op_const)(CFGetRegD(cf), cv);
    else
      overflow = false;
  }
  else if (cf.const_s)
  {
    // need to backup T?
    if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t)
    {
      cg->mov(RWARG1, CFGetRegT(cf));
      MoveSToReg(CFGetRegD(cf), cf);
      (cg->*op)(CFGetRegD(cf), RWARG1);
    }
    else
    {
      MoveSToReg(CFGetRegD(cf), cf);
      (cg->*op)(CFGetRegD(cf), CFGetRegT(cf));
    }
  }
  else if (cf.const_t)
  {
    MoveSToReg(CFGetRegD(cf), cf);
    if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0)
      (cg->*op_const)(CFGetRegD(cf), cv);
    else
      overflow = false;
  }
  else if (cf.valid_host_s)
  {
    if (cf.host_d != cf.host_s)
      cg->mov(CFGetRegD(cf), CFGetRegS(cf));
    (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
  }
  else if (cf.valid_host_t)
  {
    if (cf.host_d != cf.host_t)
      cg->mov(CFGetRegD(cf), CFGetRegT(cf));
    (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS()));
  }
  else
  {
    cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS()));
    (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT()));
  }

  if (overflow)
  {
    DebugAssert(cf.valid_host_d);
    TestOverflow(CFGetRegD(cf));
  }
}

void CPU::NewRec::X64Compiler::Compile_add(CompileFlags cf)
{
  Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::NewRec::X64Compiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false);
}

void CPU::NewRec::X64Compiler::Compile_sub(CompileFlags cf)
{
  Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::NewRec::X64Compiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false);
}

void CPU::NewRec::X64Compiler::Compile_and(CompileFlags cf)
{
  // special cases - and with self -> self, and with 0 -> 0
  const Reg32 regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    MoveSToReg(regd, cf);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    cg->xor_(regd, regd);
    return;
  }

  Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false);
}

void CPU::NewRec::X64Compiler::Compile_or(CompileFlags cf)
{
  // or/nor with 0 -> no effect
  const Reg32 regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false);
}

void CPU::NewRec::X64Compiler::Compile_xor(CompileFlags cf)
{
  const Reg32 regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    cg->xor_(regd, regd);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false);
}

void CPU::NewRec::X64Compiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  cg->not_(CFGetRegD(cf));
}

void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::NewRec::X64Compiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf, bool sign)
{
  const Reg32 rd = CFGetRegD(cf);
  const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  // Case where D == S, can't use xor because it changes flags
  // TODO: swap and reverse op for constants
  if (rd != rs && rd != rt)
    cg->xor_(rd, rd);

  if (cf.valid_host_t)
    cg->cmp(rs, CFGetRegT(cf));
  else if (cf.const_t)
    cg->cmp(rs, GetConstantRegU32(cf.MipsT()));
  else
    cg->cmp(rs, MipsPtr(cf.MipsT()));

  if (rd == rs || rd == rt)
    cg->mov(rd, 0);

  sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8());
}
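// Computes the effective address (base register S plus sign-extended 16-bit immediate) for
// a memory access. If the offset is zero and S already lives in a host register, that
// register is returned directly; otherwise the sum is materialized into RWARG1 or the
// caller-supplied register.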
Xbyak::Reg32
CPU::NewRec::X64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf,
                                                     const std::optional<VirtualMemoryAddress>& address,
                                                     const std::optional<const Xbyak::Reg32>& reg /* = std::nullopt */)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Reg32 dst = reg.has_value() ? reg.value() : RWARG1;
  if (address.has_value())
  {
    cg->mov(dst, address.value());
  }
  else
  {
    if (cf.valid_host_s)
    {
      if (const Reg32 src = CFGetRegS(cf); src != dst)
        cg->mov(dst, CFGetRegS(cf));
    }
    else
    {
      cg->mov(dst, MipsPtr(cf.MipsS()));
    }

    if (imm != 0)
      cg->add(dst, inst->i.imm_sext32());
  }

  return dst;
}
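// Fastmem loads are emitted as a single mov through RMEMBASE (or through a page-table
// lookup when LUT mode is active) and padded with nops up to BACKPATCH_JMP_SIZE, so that a
// faulting access can later be backpatched into a jump to a slowmem thunk. The non-fastmem
// path calls the C++ memory handler thunks instead.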
template<typename RegAllocFn>
Xbyak::Reg32 CPU::NewRec::X64Compiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign,
                                                    bool use_fastmem, const RegAllocFn& dst_reg_alloc)
{
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    const Reg32 dst = dst_reg_alloc();

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg != RWARG3);
      cg->mov(RWARG3, addr_reg.cvt32());
      cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
      cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
    }

    const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;

    u8* start = cg->getCurr<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) :
               cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]);
      }
      break;

      case MemoryAccessSize::HalfWord:
      {
        sign ? cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) :
               cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]);
      }
      break;

      case MemoryAccessSize::Word:
      {
        cg->mov(dst, cg->dword[membase + addr_reg.cvt64()]);
      }
      break;
    }

    u8* end = cg->getCurr<u8*>();
    while ((end - start) < BACKPATCH_JMP_SIZE)
    {
      cg->nop();
      end = cg->getCurr<u8*>();
    }

    AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
                     static_cast<u32>(dst.getIdx()), size, sign, true);
    return dst;
  }

  if (addr_reg != RWARG1)
    cg->mov(RWARG1, addr_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte));
    }
    break;

    case MemoryAccessSize::HalfWord:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord));
    }
    break;

    case MemoryAccessSize::Word:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    cg->test(RXRET, RXRET);

    BackupHostState();
    SwitchToFarCode(true, &CodeGenerator::js);

    // flush regs, but not pc, it's going to get overwritten
    // flush cycles because of the GTE instruction stuff...
    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    cg->mov(RWARG1, RWRET);
    cg->neg(RWARG1);
    cg->shl(RWARG1, 2);
    cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0),
                                                                m_current_instruction_branch_delay_slot, false,
                                                                inst->cop.cop_n));
    cg->mov(RWARG2, m_current_instruction_pc);
    cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
    m_dirty_pc = false;

    EndAndLinkBlock(std::nullopt, true, false);

    SwitchToNearCode(false);
    RestoreHostState();
  }

  const Xbyak::Reg32 dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8());
    }
    break;

    case MemoryAccessSize::HalfWord:
    {
      sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16());
    }
    break;

    case MemoryAccessSize::Word:
    {
      if (dst_reg != RWRET)
        cg->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}
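// Stores mirror the load path. The checked thunks report a raised exception through their
// return value (sign bit for loads, nonzero for stores); the far path folds that code back
// into CAUSE before calling RaiseException().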
void CPU::NewRec::X64Compiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg,
                                             MemoryAccessSize size, bool use_fastmem)
{
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3);
      cg->mov(RWARG3, addr_reg.cvt32());
      cg->shr(RXARG3, Bus::FASTMEM_LUT_PAGE_SHIFT);
      cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]);
    }

    const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE;

    u8* start = cg->getCurr<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8());
        break;

      case MemoryAccessSize::HalfWord:
        cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16());
        break;

      case MemoryAccessSize::Word:
        cg->mov(cg->dword[membase + addr_reg.cvt64()], value_reg.cvt32());
        break;
    }

    u8* end = cg->getCurr<u8*>();
    while ((end - start) < BACKPATCH_JMP_SIZE)
    {
      cg->nop();
      end = cg->getCurr<u8*>();
    }

    AddLoadStoreInfo(start, static_cast<u32>(end - start), static_cast<u32>(addr_reg.getIdx()),
                     static_cast<u32>(value_reg.getIdx()), size, false, false);
    return;
  }

  if (addr_reg != RWARG1)
    cg->mov(RWARG1, addr_reg);
  if (value_reg != RWARG2)
    cg->mov(RWARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
    }
    break;

    case MemoryAccessSize::HalfWord:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
    }
    break;

    case MemoryAccessSize::Word:
    {
      cg->call(checked ? reinterpret_cast<const void*>(&Recompiler::Thunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    cg->test(RWRET, RWRET);

    BackupHostState();
    SwitchToFarCode(true, &CodeGenerator::jnz);

    // flush regs, but not pc, it's going to get overwritten
    // flush cycles because of the GTE instruction stuff...
    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    cg->mov(RWARG1, RWRET);
    cg->shl(RWARG1, 2);
    cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0),
                                                                m_current_instruction_branch_delay_slot, false,
                                                                inst->cop.cop_n));
    cg->mov(RWARG2, m_current_instruction_pc);
    cg->call(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;

    EndAndLinkBlock(std::nullopt, true, false);

    SwitchToNearCode(false);
    RestoreHostState();
  }
}

void CPU::NewRec::X64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                           const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
                                          std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                          std::optional<Reg32>();
  FlushForLoadStore(address, false, use_fastmem);
  const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Reg32 data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
    if (cf.MipsT() == Reg::zero)
      return RWRET;

    return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                 EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    cg->mov(RWARG1, inst->bits);
    cg->mov(RWARG2, addr);
    cg->mov(RWARG3, data);
    cg->call(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().getIdx());
  }
}
void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                           const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP));
  ComputeLoadStoreAddressArg(cf, address, addr);
  cg->mov(RWARG1, addr);
  cg->and_(RWARG1, ~0x3u);
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.getIdx());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  Reg32 value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = Reg32(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (HasConstantReg(rt))
        cg->mov(value, GetConstantRegU32(rt));
      else if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        cg->mov(value, Reg32(rtreg.value()));
      else
        cg->mov(value, MipsPtr(rt));
    }
    else
    {
      value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value != cg->ecx);
  cg->mov(cg->ecx, addr);
  cg->and_(cg->ecx, 3);
  cg->shl(cg->ecx, 3); // *8

  // TODO for other arch: reverse subtract
  DebugAssert(RWARG2 != cg->ecx);
  cg->mov(RWARG2, 24);
  cg->sub(RWARG2, cg->ecx);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    cg->mov(addr, 0xFFFFFFu);
    cg->shr(addr, cg->cl);
    cg->and_(value, addr);
    cg->mov(cg->ecx, RWARG2);
    cg->shl(RWRET, cg->cl);
    cg->or_(value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    cg->shr(RWRET, cg->cl);
    cg->mov(addr, 0xFFFFFF00u);
    cg->mov(cg->ecx, RWARG2);
    cg->shl(addr, cg->cl);
    cg->and_(value, addr);
    cg->or_(value, RWRET);
  }

  FreeHostReg(addr.getIdx());
}

void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                            const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
                                          std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                          std::optional<Reg32>();
  FlushForLoadStore(address, false, use_fastmem);
  const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      cg->mov(cg->dword[PTR(ptr)], RWRET);
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      cg->movsx(RWRET, RWRET.cvt16());
      cg->mov(cg->dword[PTR(ptr)], RWRET);
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      cg->movzx(RWRET, RWRET.cvt16());
      cg->mov(cg->dword[PTR(ptr)], RWRET);
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      cg->mov(RWARG2, RWRET);
      cg->mov(RWARG1, index);
      cg->call(&GTE::WriteRegister);
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(RWRET != RWARG1 && RWRET != RWARG2);
      cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
      cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWRET);
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG3, RWRET);
    cg->mov(RWARG2, addr);
    cg->mov(RWARG1, inst->bits);
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
    FreeHostReg(addr_reg.value().getIdx());
  }
}
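// Plain stores: T is used in place when it has a host register, otherwise moved into
// RWARG2. With PGXP enabled the address is kept in a callee-saved register so it survives
// the store and can be passed to the PGXP callback afterwards.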
void CPU::NewRec::X64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                           const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<Reg32> addr_reg = g_settings.gpu_pgxp_enable ?
                                          std::optional<Reg32>(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                          std::optional<Reg32>();
  FlushForLoadStore(address, true, use_fastmem);
  const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Reg32 data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    cg->mov(RWARG2, addr);
    cg->mov(RWARG1, inst->bits);
    cg->call(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().getIdx());
  }
}

void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                           const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);
  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP));
  ComputeLoadStoreAddressArg(cf, address, addr);
  cg->mov(RWARG1, addr);
  cg->and_(RWARG1, ~0x3u);
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  const Reg32 value = RWARG2;
  DebugAssert(value != cg->ecx);
  if (HasConstantReg(rt))
    cg->mov(value, GetConstantRegU32(rt));
  else if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
    cg->mov(value, Reg32(rtreg.value()));
  else
    cg->mov(value, MipsPtr(rt));

  cg->mov(cg->ecx, addr);
  cg->and_(cg->ecx, 3);
  cg->shl(cg->ecx, 3); // *8

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    cg->mov(RWARG3, 0xFFFFFF00u);
    cg->shl(RWARG3, cg->cl);
    cg->and_(RWRET, RWARG3);

    cg->mov(RWARG3, 24);
    cg->sub(RWARG3, cg->ecx);
    cg->mov(cg->ecx, RWARG3);
    cg->shr(value, cg->cl);
    cg->or_(value, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    cg->shl(value, cg->cl);

    DebugAssert(RWARG3 != cg->ecx);
    cg->mov(RWARG3, 24);
    cg->sub(RWARG3, cg->ecx);
    cg->mov(cg->ecx, RWARG3);
    cg->mov(RWARG3, 0x00FFFFFFu);
    cg->shr(RWARG3, cg->cl);
    cg->and_(RWRET, RWARG3);
    cg->or_(value, RWRET);
  }

  FreeHostReg(addr.getIdx());

  cg->mov(RWARG1, addr);
  cg->and_(RWARG1, ~0x3u);
  GenerateStore(RWARG1, value, MemoryAccessSize::Word, use_fastmem);
}
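// swc2 reads the GTE register first (directly, or through the read handler), then stores it
// to memory. With PGXP enabled both address and data must outlive the store, hence the pair
// of callee-saved temporaries below.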
void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                            const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      cg->mov(RWARG1, index);
      cg->call(&GTE::ReadRegister);
      cg->mov(RWARG2, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  // PGXP makes this a giant pain.
  if (!g_settings.gpu_pgxp_enable)
  {
    FlushForLoadStore(address, true, use_fastmem);
    const Reg32 addr = ComputeLoadStoreAddressArg(cf, address);
    GenerateStore(addr, RWARG2, size, use_fastmem);
    return;
  }

  // TODO: This can be simplified because we don't need to validate in PGXP..
  const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr_reg);
  cg->mov(data_backup, RWARG2);
  GenerateStore(addr_reg, RWARG2, size, use_fastmem);

  Flush(FLUSH_FOR_C_CALL);
  cg->mov(RWARG3, data_backup);
  cg->mov(RWARG2, addr_reg);
  cg->mov(RWARG1, inst->bits);
  cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  FreeHostReg(addr_reg.getIdx());
  FreeHostReg(data_backup.getIdx());
}

void CPU::NewRec::X64Compiler::Compile_mtc0(CompileFlags cf)
{
  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  // TODO: const apply mask
  const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1;
  const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0;
  if (mask == 0)
  {
    // if it's a read-only register, ignore
    Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Reg32 changed_bits = RWARG3;

  // update value
  if (cf.valid_host_t)
  {
    cg->mov(RWARG1, rt);
    cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    cg->and_(RWARG1, mask);
    if (needs_bit_test)
    {
      cg->mov(changed_bits, RWARG2);
      cg->xor_(changed_bits, RWARG1);
    }
    cg->and_(RWARG2, ~mask);
    cg->or_(RWARG2, RWARG1);
    cg->mov(cg->dword[PTR(ptr)], RWARG2);
  }
  else
  {
    cg->mov(RWARG2, cg->dword[PTR(ptr)]);
    if (needs_bit_test)
    {
      cg->mov(changed_bits, RWARG2);
      cg->xor_(changed_bits, constant_value & mask);
    }
    cg->and_(RWARG2, ~mask);
    cg->or_(RWARG2, constant_value & mask);
    cg->mov(cg->dword[PTR(ptr)], RWARG2);
  }

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    cg->test(changed_bits, 1u << 16);
    SwitchToFarCode(true, &CodeGenerator::jnz);
    cg->push(RWARG1);
    cg->push(RWARG2);
    cg->call(&CPU::UpdateMemoryPointers);
    cg->pop(RWARG2);
    cg->pop(RWARG1);
    cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]);
    SwitchToNearCode(true);
  }

  if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE)
  {
    // Writes to SR or CAUSE can unmask a pending interrupt, so the interrupt test reruns.
    const Reg32 sr =
      (reg == Cop0Reg::SR) ? RWARG2 : (cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]), RWARG1);
    TestInterrupts(sr);
  }

  if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions)
  {
    // TODO: DCIC handling for debug breakpoints
    Log_WarningPrintf("TODO: DCIC handling for debug breakpoints");
  }
}
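// rfe pops the privilege/interrupt-enable stack by shifting the SR mode bits right two
// places while preserving the upper bits, then re-runs the interrupt test since IEc may
// just have been re-enabled.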
void CPU::NewRec::X64Compiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  static constexpr u32 mode_bits_mask = UINT32_C(0b1111);
  cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]);
  cg->mov(RWARG2, RWARG1);
  cg->shr(RWARG2, 2);
  cg->and_(RWARG1, ~mode_bits_mask);
  cg->and_(RWARG2, mode_bits_mask);
  cg->or_(RWARG1, RWARG2);
  cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1);

  TestInterrupts(RWARG1);
}

void CPU::NewRec::X64Compiler::TestInterrupts(const Xbyak::Reg32& sr)
{
  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;
  cg->test(sr, 1);
  cg->jz(no_interrupt, CodeGenerator::T_NEAR);

  // sr & cause
  cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  cg->test(sr, 0xFF00);

  SwitchToFarCode(true, &CodeGenerator::jnz);
  BackupHostState();
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    cg->mov(RWARG2, m_compiler_pc);
    cg->call(static_cast<void (*)(u32, u32)>(&CPU::RaiseException));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc);
    m_dirty_pc = false;
    cg->mov(cg->dword[PTR(&g_state.downcount)], 0);
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  cg->L(no_interrupt);
}
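// GTE register moves: GetGTERegisterPointer() classifies each register as directly
// addressable, needing 16-bit sign/zero extension, requiring the C++ read/write handler, or
// pushing the screen-XY FIFO (SXYP); the codegen below follows that classification.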
void CPU::NewRec::X64Compiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    cg->call(&GTE::ReadRegister);

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    cg->mov(Reg32(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, inst->bits);
    cg->mov(RWARG2, Reg32(hreg));
    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::NewRec::X64Compiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
    {
      cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
    }
    else if (cf.valid_host_t)
    {
      cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
    }
    else
    {
      cg->mov(RWARG1, MipsPtr(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
    }
    else if (cf.valid_host_t)
    {
      sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
    else
    {
      sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
             cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
      cg->mov(cg->dword[PTR(ptr)], RWARG1);
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    cg->mov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    cg->call(&GTE::WriteRegister);
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
    if (!cf.const_t && !cf.valid_host_t)
      cg->mov(RWARG3, MipsPtr(cf.MipsT()));
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
    if (cf.const_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
    else if (cf.valid_host_t)
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
    else
      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::NewRec::X64Compiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  cg->call(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}
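// Builds the slowmem thunk that a faulting fastmem access gets backpatched to jump into:
// live caller-saved GPRs are spilled (above the 32-byte Win32 shadow space when needed),
// the unchecked C++ memory handler is called, and control jumps back to the code following
// the original access site, which itself is rewritten with the jump plus nop filler.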
u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                       TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                       u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
                                       bool is_load)
{
  CodeGenerator acg(thunk_space, thunk_code);
  CodeGenerator* cg = &acg;

  static constexpr u32 GPR_SIZE = 8;

  // on win32, we need to reserve an additional 32 bytes shadow space when calling out to C
#ifdef _WIN32
  static constexpr u32 SHADOW_SIZE = 32;
#else
  static constexpr u32 SHADOW_SIZE = 0;
#endif

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + SHADOW_SIZE;

  if (stack_size > 0)
  {
    cg->sub(cg->rsp, stack_size);

    u32 stack_offset = SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
    cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add);

  if (address_register != static_cast<u8>(RWARG1.getIdx()))
    cg->mov(RWARG1, Reg32(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.getIdx()))
      cg->mov(RWARG2, Reg32(data_register));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryByte) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      cg->call(is_load ? reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedReadMemoryWord) :
                         reinterpret_cast<const void*>(&Recompiler::Thunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  if (is_load)
  {
    const Reg32 dst = Reg32(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8());
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16());
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst != RWRET)
          cg->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
    cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove);

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = SHADOW_SIZE;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]);
        stack_offset += GPR_SIZE;
      }
    }

    cg->add(cg->rsp, stack_size);
  }

  cg->jmp(static_cast<const u8*>(code_address) + code_size);

  // fill the rest of it with nops, if any
  DebugAssert(code_size >= BACKPATCH_JMP_SIZE);
  if (code_size > BACKPATCH_JMP_SIZE)
    std::memset(static_cast<u8*>(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE);

  return static_cast<u32>(cg->getSize());
}