From a3013efbcad705a326d668d996b8d2f4f11d1172 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 30 Mar 2024 00:04:35 +1000 Subject: [PATCH] CPU/NewRec: Fix PGXP desync Wobbly geometry in Threads of Fate. --- src/core/cpu_code_cache_private.h | 4 -- src/core/cpu_newrec_compiler_aarch32.cpp | 88 +++++++++++++++--------- src/core/cpu_newrec_compiler_aarch64.cpp | 84 ++++++++++++++-------- src/core/cpu_newrec_compiler_riscv64.cpp | 84 ++++++++++++++-------- src/core/cpu_newrec_compiler_x64.cpp | 85 +++++++++++++++-------- 5 files changed, 221 insertions(+), 124 deletions(-) diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h index b36b9bb7e..cb4544bb0 100644 --- a/src/core/cpu_code_cache_private.h +++ b/src/core/cpu_code_cache_private.h @@ -19,10 +19,6 @@ #include #include -#ifdef ENABLE_RECOMPILER -// #include "cpu_recompiler_types.h" -#endif - namespace CPU::CodeCache { enum : u32 diff --git a/src/core/cpu_newrec_compiler_aarch32.cpp b/src/core/cpu_newrec_compiler_aarch32.cpp index 57a200b06..0592b7f93 100644 --- a/src/core/cpu_newrec_compiler_aarch32.cpp +++ b/src/core/cpu_newrec_compiler_aarch32.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "cpu_newrec_compiler_aarch32.h" @@ -1637,9 +1637,9 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize { // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; // new_value = (value & mask) | (RWRET << (24 - shift)); - EmitMov(addr, 0xFFFFFFu); - armAsm->lsr(addr, addr, RARG2); - armAsm->and_(value, value, addr); + EmitMov(RARG4, 0xFFFFFFu); + armAsm->lsr(RARG4, RARG4, RARG2); + armAsm->and_(value, value, RARG4); armAsm->lsl(RRET, RRET, RARG3); armAsm->orr(value, value, RRET); } @@ -1648,27 +1648,40 @@ void CPU::NewRec::AArch32Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); // new_value = (value & mask) | (RWRET >> shift); armAsm->lsr(RRET, RRET, RARG2); - EmitMov(addr, 0xFFFFFF00u); - armAsm->lsl(addr, addr, RARG3); - armAsm->and_(value, value, addr); + EmitMov(RARG4, 0xFFFFFF00u); + armAsm->lsl(RARG4, RARG4, RARG3); + armAsm->and_(value, value, RARG4); armAsm->orr(value, value, RRET); } FreeHostReg(addr.GetCode()); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RARG3, value); + armAsm->and_(RARG2, addr, armCheckLogicalConstant(~0x3u)); + EmitMov(RARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_LW)); + } } void CPU::NewRec::AArch32Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, const std::optional& address) { + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); const std::optional addr_reg = g_settings.gpu_pgxp_enable ? std::optional(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional(); FlushForLoadStore(address, false, use_fastmem); const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); - GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; }); + const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action]() { + return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ? 
+ Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : + RRET; + }); - const u32 index = static_cast(inst->r.rt.GetValue()); - const auto [ptr, action] = GetGTERegisterPointer(index, true); switch (action) { case GTERegisterAccessAction::Ignore: @@ -1678,28 +1691,28 @@ void CPU::NewRec::AArch32Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz case GTERegisterAccessAction::Direct: { - armAsm->str(RRET, PTR(ptr)); + armAsm->str(value, PTR(ptr)); break; } case GTERegisterAccessAction::SignExtend16: { - armAsm->sxth(RRET, RRET); - armAsm->str(RRET, PTR(ptr)); + armAsm->sxth(RARG3, value); + armAsm->str(RARG3, PTR(ptr)); break; } case GTERegisterAccessAction::ZeroExtend16: { - armAsm->uxth(RRET, RRET); - armAsm->str(RRET, PTR(ptr)); + armAsm->uxth(RARG3, value); + armAsm->str(RARG3, PTR(ptr)); break; } case GTERegisterAccessAction::CallHandler: { Flush(FLUSH_FOR_C_CALL); - armAsm->mov(RARG2, RRET); + armAsm->mov(RARG2, value); EmitMov(RARG1, index); EmitCall(reinterpret_cast(>E::WriteRegister)); break; @@ -1710,12 +1723,12 @@ void CPU::NewRec::AArch32Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz // SXY0 <- SXY1 // SXY1 <- SXY2 // SXY2 <- SXYP - DebugAssert(RRET.GetCode() != RARG2.GetCode() && RRET.GetCode() != RARG3.GetCode()); + DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode()); armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0])); armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0])); armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0])); armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0])); - armAsm->str(RRET, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0])); break; } @@ -1729,11 +1742,13 @@ void CPU::NewRec::AArch32Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz if (g_settings.gpu_pgxp_enable) { Flush(FLUSH_FOR_C_CALL); - armAsm->mov(RARG3, RRET); + armAsm->mov(RARG3, value); + if (value.GetCode() != RRET.GetCode()) + FreeHostReg(value.GetCode()); armAsm->mov(RARG2, addr); + FreeHostReg(addr_reg.value().GetCode()); EmitMov(RARG1, inst->bits); EmitCall(reinterpret_cast(&PGXP::CPU_LWC2)); - FreeHostReg(addr_reg.value().GetCode()); } } @@ -1782,17 +1797,18 @@ void CPU::NewRec::AArch32Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize // TODO: this can take over rt's value if it's no longer needed // NOTE: can't trust T in cf because of the flush const Reg rt = inst->r.rt; - const Register value = RARG2; - if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) - armAsm->mov(value, Register(rtreg.value())); - else if (HasConstantReg(rt)) - EmitMov(value, GetConstantRegU32(rt)); - else - armAsm->ldr(value, MipsPtr(rt)); + const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2; + MoveMIPSRegToReg(value, rt); armAsm->and_(RSCRATCH, addr, 3); armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8 + // Don't need the original address anymore. 
+ if (!g_settings.gpu_pgxp_enable) + FreeHostReg(addr.GetCode()); + else + armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u)); + if (inst->op == InstructionOp::swl) { // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; @@ -1820,10 +1836,18 @@ void CPU::NewRec::AArch32Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize armAsm->orr(value, value, RRET); } - FreeHostReg(addr.GetCode()); + GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem); - armAsm->and_(RARG1, addr, armCheckLogicalConstant(~0x3u)); - GenerateStore(RARG1, value, MemoryAccessSize::Word, use_fastmem); + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RARG3, value); + FreeHostReg(value.GetCode()); + armAsm->mov(RARG2, addr); + FreeHostReg(addr.GetCode()); + EmitMov(RARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SW)); + } } void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, @@ -1877,10 +1901,10 @@ void CPU::NewRec::AArch32Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSiz Flush(FLUSH_FOR_C_CALL); armAsm->mov(RARG3, data_backup); armAsm->mov(RARG2, addr_reg); - EmitMov(RARG1, inst->bits); - EmitCall(reinterpret_cast(&PGXP::CPU_SWC2)); FreeHostReg(addr_reg.GetCode()); FreeHostReg(data_backup.GetCode()); + EmitMov(RARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SWC2)); } void CPU::NewRec::AArch32Compiler::Compile_mtc0(CompileFlags cf) diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp index a9820cb3b..13a47201b 100644 --- a/src/core/cpu_newrec_compiler_aarch64.cpp +++ b/src/core/cpu_newrec_compiler_aarch64.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "cpu_newrec_compiler_aarch64.h" @@ -1616,9 +1616,9 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize { // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; // new_value = (value & mask) | (RWRET << (24 - shift)); - EmitMov(addr, 0xFFFFFFu); - armAsm->lsrv(addr, addr, RWARG2); - armAsm->and_(value, value, addr); + EmitMov(RWARG4, 0xFFFFFFu); + armAsm->lsrv(RWARG4, RWARG4, RWARG2); + armAsm->and_(value, value, RWARG4); armAsm->lslv(RWRET, RWRET, RWARG3); armAsm->orr(value, value, RWRET); } @@ -1627,27 +1627,40 @@ void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); // new_value = (value & mask) | (RWRET >> shift); armAsm->lsrv(RWRET, RWRET, RWARG2); - EmitMov(addr, 0xFFFFFF00u); - armAsm->lslv(addr, addr, RWARG3); - armAsm->and_(value, value, addr); + EmitMov(RWARG4, 0xFFFFFF00u); + armAsm->lslv(RWARG4, RWARG4, RWARG3); + armAsm->and_(value, value, RWARG4); armAsm->orr(value, value, RWRET); } FreeHostReg(addr.GetCode()); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, value); + armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u)); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_LW)); + } } void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, const std::optional& address) { + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); const std::optional addr_reg = g_settings.gpu_pgxp_enable ? 
std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional(); FlushForLoadStore(address, false, use_fastmem); const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); - GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; }); + const WRegister value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action]() { + return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ? + WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : + RWRET; + }); - const u32 index = static_cast(inst->r.rt.GetValue()); - const auto [ptr, action] = GetGTERegisterPointer(index, true); switch (action) { case GTERegisterAccessAction::Ignore: @@ -1657,28 +1670,28 @@ void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz case GTERegisterAccessAction::Direct: { - armAsm->str(RWRET, PTR(ptr)); + armAsm->str(value, PTR(ptr)); break; } case GTERegisterAccessAction::SignExtend16: { - armAsm->sxth(RWRET, RWRET); - armAsm->str(RWRET, PTR(ptr)); + armAsm->sxth(RWARG3, value); + armAsm->str(RWARG3, PTR(ptr)); break; } case GTERegisterAccessAction::ZeroExtend16: { - armAsm->uxth(RWRET, RWRET); - armAsm->str(RWRET, PTR(ptr)); + armAsm->uxth(RWARG3, value); + armAsm->str(RWARG3, PTR(ptr)); break; } case GTERegisterAccessAction::CallHandler: { Flush(FLUSH_FOR_C_CALL); - armAsm->mov(RWARG2, RWRET); + armAsm->mov(RWARG2, value); EmitMov(RWARG1, index); EmitCall(reinterpret_cast(>E::WriteRegister)); break; @@ -1689,12 +1702,12 @@ void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz // SXY0 <- SXY1 // SXY1 <- SXY2 // SXY2 <- SXYP - DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode()); + DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode()); armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0])); armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0])); armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0])); armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0])); - armAsm->str(RWRET, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0])); break; } @@ -1708,11 +1721,13 @@ void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz if (g_settings.gpu_pgxp_enable) { Flush(FLUSH_FOR_C_CALL); - armAsm->mov(RWARG3, RWRET); + armAsm->mov(RWARG3, value); + if (value.GetCode() != RWRET.GetCode()) + FreeHostReg(value.GetCode()); armAsm->mov(RWARG2, addr); + FreeHostReg(addr_reg.value().GetCode()); EmitMov(RWARG1, inst->bits); EmitCall(reinterpret_cast(&PGXP::CPU_LWC2)); - FreeHostReg(addr_reg.value().GetCode()); } } @@ -1761,17 +1776,18 @@ void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize // TODO: this can take over rt's value if it's no longer needed // NOTE: can't trust T in cf because of the flush const Reg rt = inst->r.rt; - const WRegister value = RWARG2; - if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) - armAsm->mov(value, WRegister(rtreg.value())); - else if (HasConstantReg(rt)) - EmitMov(value, GetConstantRegU32(rt)); - else - armAsm->ldr(value, MipsPtr(rt)); + const WRegister value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2; + MoveMIPSRegToReg(value, rt); armAsm->and_(RWSCRATCH, addr, 3); armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8 + // Don't need the original address anymore. 
+ if (!g_settings.gpu_pgxp_enable) + FreeHostReg(addr.GetCode()); + else + armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u)); + if (inst->op == InstructionOp::swl) { // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; @@ -1801,8 +1817,18 @@ void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize FreeHostReg(addr.GetCode()); - armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); - GenerateStore(RWARG1, value, MemoryAccessSize::Word, use_fastmem); + GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, value); + FreeHostReg(value.GetCode()); + armAsm->mov(RWARG2, addr); + FreeHostReg(addr.GetCode()); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SW)); + } } void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, diff --git a/src/core/cpu_newrec_compiler_riscv64.cpp b/src/core/cpu_newrec_compiler_riscv64.cpp index 4473bc836..a847828ae 100644 --- a/src/core/cpu_newrec_compiler_riscv64.cpp +++ b/src/core/cpu_newrec_compiler_riscv64.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "cpu_newrec_compiler_riscv64.h" @@ -1925,9 +1925,9 @@ void CPU::NewRec::RISCV64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize { // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; // new_value = (value & mask) | (RWRET << (24 - shift)); - EmitMov(addr, 0xFFFFFFu); - rvAsm->SRLW(addr, addr, RARG2); - rvAsm->AND(value, value, addr); + EmitMov(RSCRATCH, 0xFFFFFFu); + rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG2); + rvAsm->AND(value, value, RSCRATCH); rvAsm->SLLW(RRET, RRET, RARG3); rvAsm->OR(value, value, RRET); } @@ -1936,26 +1936,39 @@ void CPU::NewRec::RISCV64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); // new_value = (value & mask) | (RWRET >> shift); rvAsm->SRLW(RRET, RRET, RARG2); - EmitMov(addr, 0xFFFFFF00u); - rvAsm->SLLW(addr, addr, RARG3); - rvAsm->AND(value, value, addr); + EmitMov(RSCRATCH, 0xFFFFFF00u); + rvAsm->SLLW(RSCRATCH, RSCRATCH, RARG3); + rvAsm->AND(value, value, RSCRATCH); rvAsm->OR(value, value, RRET); } FreeHostReg(addr.Index()); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + rvAsm->MV(RARG3, value); + rvAsm->ANDI(RARG2, addr, ~0x3u); + EmitMov(RARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_LW)); + } } void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, const std::optional& address) { + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); const std::optional addr_reg = g_settings.gpu_pgxp_enable ? std::optional(GPR(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional(); FlushForLoadStore(address, false, use_fastmem); const GPR addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); - GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; }); + const GPR value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action]() { + return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ? 
+ GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : + RRET; + }); - const u32 index = static_cast(inst->r.rt.GetValue()); - const auto [ptr, action] = GetGTERegisterPointer(index, true); switch (action) { case GTERegisterAccessAction::Ignore: @@ -1965,28 +1978,28 @@ void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz case GTERegisterAccessAction::Direct: { - rvAsm->SW(RRET, PTR(ptr)); + rvAsm->SW(value, PTR(ptr)); break; } case GTERegisterAccessAction::SignExtend16: { - EmitSExtH(RRET, RRET); - rvAsm->SW(RRET, PTR(ptr)); + EmitSExtH(RARG3, value); + rvAsm->SW(RARG3, PTR(ptr)); break; } case GTERegisterAccessAction::ZeroExtend16: { - EmitUExtH(RRET, RRET); - rvAsm->SW(RRET, PTR(ptr)); + EmitUExtH(RARG3, value); + rvAsm->SW(RARG3, PTR(ptr)); break; } case GTERegisterAccessAction::CallHandler: { Flush(FLUSH_FOR_C_CALL); - rvAsm->MV(RARG2, RRET); + rvAsm->MV(RARG2, value); EmitMov(RARG1, index); EmitCall(reinterpret_cast(>E::WriteRegister)); break; @@ -1997,12 +2010,12 @@ void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz // SXY0 <- SXY1 // SXY1 <- SXY2 // SXY2 <- SXYP - DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index()); + DebugAssert(value.Index() != RARG2.Index() && value.Index() != RARG3.Index()); rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0])); rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0])); rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0])); rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0])); - rvAsm->SW(RRET, PTR(&g_state.gte_regs.SXY2[0])); + rvAsm->SW(value, PTR(&g_state.gte_regs.SXY2[0])); break; } @@ -2016,11 +2029,13 @@ void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSiz if (g_settings.gpu_pgxp_enable) { Flush(FLUSH_FOR_C_CALL); - rvAsm->MV(RARG3, RRET); + rvAsm->MV(RARG3, value); + if (value.Index() != RRET.Index()) + FreeHostReg(value.Index()); rvAsm->MV(RARG2, addr); + FreeHostReg(addr_reg.value().Index()); EmitMov(RARG1, inst->bits); EmitCall(reinterpret_cast(&PGXP::CPU_LWC2)); - FreeHostReg(addr_reg.value().Index()); } } @@ -2068,17 +2083,18 @@ void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize // TODO: this can take over rt's value if it's no longer needed // NOTE: can't trust T in cf because of the flush const Reg rt = inst->r.rt; - const GPR value = RARG2; - if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) - rvAsm->MV(value, GPR(rtreg.value())); - else if (HasConstantReg(rt)) - EmitMov(value, GetConstantRegU32(rt)); - else - rvAsm->LW(value, PTR(&g_state.regs.r[static_cast(rt)])); + const GPR value = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2; + MoveMIPSRegToReg(value, rt); rvAsm->ANDI(RSCRATCH, addr, 3); rvAsm->SLLIW(RSCRATCH, RSCRATCH, 3); // *8 + // Don't need the original address anymore. 
+ if (!g_settings.gpu_pgxp_enable) + FreeHostReg(addr.Index()); + else + rvAsm->ANDI(addr, addr, ~0x3u); + if (inst->op == InstructionOp::swl) { // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; @@ -2108,8 +2124,18 @@ void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize FreeHostReg(addr.Index()); - rvAsm->ANDI(RARG1, addr, ~0x3u); - GenerateStore(RARG1, value, MemoryAccessSize::Word, use_fastmem); + GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + rvAsm->MV(RARG3, value); + FreeHostReg(value.Index()); + rvAsm->MV(RARG2, addr); + FreeHostReg(addr.Index()); + EmitMov(RARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SW)); + } } void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, diff --git a/src/core/cpu_newrec_compiler_x64.cpp b/src/core/cpu_newrec_compiler_x64.cpp index d476c28d4..6de39d0a8 100644 --- a/src/core/cpu_newrec_compiler_x64.cpp +++ b/src/core/cpu_newrec_compiler_x64.cpp @@ -1577,13 +1577,14 @@ void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize siz cg->mov(RWARG2, 24); cg->sub(RWARG2, cg->ecx); + const Reg32& temp = (RWARG3 == cg->ecx) ? RWARG4 : RWARG3; if (inst->op == InstructionOp::lwl) { // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; // new_value = (value & mask) | (RWRET << (24 - shift)); - cg->mov(addr, 0xFFFFFFu); - cg->shr(addr, cg->cl); - cg->and_(value, addr); + cg->mov(temp, 0xFFFFFFu); + cg->shr(temp, cg->cl); + cg->and_(value, temp); cg->mov(cg->ecx, RWARG2); cg->shl(RWRET, cg->cl); cg->or_(value, RWRET); @@ -1593,28 +1594,42 @@ void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize siz // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); // new_value = (value & mask) | (RWRET >> shift); cg->shr(RWRET, cg->cl); - cg->mov(addr, 0xFFFFFF00u); + cg->mov(temp, 0xFFFFFF00u); cg->mov(cg->ecx, RWARG2); - cg->shl(addr, cg->cl); - cg->and_(value, addr); + cg->shl(temp, cg->cl); + cg->and_(value, temp); cg->or_(value, RWRET); } FreeHostReg(addr.getIdx()); + + if (g_settings.gpu_pgxp_enable) + { + DebugAssert(value != RWARG3); + cg->mov(RWARG3, value); + cg->mov(RWARG2, addr); + cg->and_(RWARG2, ~0x3u); + cg->mov(RWARG1, inst->bits); + cg->call(reinterpret_cast(&PGXP::CPU_LW)); + } } void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem, const std::optional& address) { + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); const std::optional addr_reg = g_settings.gpu_pgxp_enable ? std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : std::optional(); FlushForLoadStore(address, false, use_fastmem); const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); - GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; }); + const Reg32 value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action]() { + return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ? 
+ Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : + RWRET; + }); - const u32 index = static_cast(inst->r.rt.GetValue()); - const auto [ptr, action] = GetGTERegisterPointer(index, true); switch (action) { case GTERegisterAccessAction::Ignore: @@ -1624,28 +1639,28 @@ void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize si case GTERegisterAccessAction::Direct: { - cg->mov(cg->dword[PTR(ptr)], RWRET); + cg->mov(cg->dword[PTR(ptr)], value); break; } case GTERegisterAccessAction::SignExtend16: { - cg->movsx(RWRET, RWRET.cvt16()); - cg->mov(cg->dword[PTR(ptr)], RWRET); + cg->movsx(RWARG3, value.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWARG3); break; } case GTERegisterAccessAction::ZeroExtend16: { - cg->movzx(RWRET, RWRET.cvt16()); - cg->mov(cg->dword[PTR(ptr)], RWRET); + cg->movzx(RWARG3, value.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWARG3); break; } case GTERegisterAccessAction::CallHandler: { Flush(FLUSH_FOR_C_CALL); - cg->mov(RWARG2, RWRET); + cg->mov(RWARG2, value); cg->mov(RWARG1, index); cg->call(>E::WriteRegister); break; @@ -1656,12 +1671,12 @@ void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize si // SXY0 <- SXY1 // SXY1 <- SXY2 // SXY2 <- SXYP - DebugAssert(RWRET != RWARG1 && RWRET != RWARG2); + DebugAssert(value != RWARG1 && value != RWARG2); cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]); cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]); cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1); cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2); - cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWRET); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], value); break; } @@ -1675,11 +1690,13 @@ void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize si if (g_settings.gpu_pgxp_enable) { Flush(FLUSH_FOR_C_CALL); - cg->mov(RWARG3, RWRET); + cg->mov(RWARG3, value); + if (value != RWRET) + FreeHostReg(value.getIdx()); cg->mov(RWARG2, addr); + FreeHostReg(addr_reg.value().getIdx()); cg->mov(RWARG1, inst->bits); cg->call(reinterpret_cast(&PGXP::CPU_LWC2)); - FreeHostReg(addr_reg.value().getIdx()); } } @@ -1726,19 +1743,20 @@ void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize siz // TODO: this can take over rt's value if it's no longer needed // NOTE: can't trust T in cf because of the flush const Reg rt = inst->r.rt; - const Reg32 value = RWARG2; + const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2; DebugAssert(value != cg->ecx); - if (HasConstantReg(rt)) - cg->mov(value, GetConstantRegU32(rt)); - else if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) - cg->mov(value, Reg32(rtreg.value())); - else - cg->mov(value, MipsPtr(rt)); + MoveMIPSRegToReg(value, rt); cg->mov(cg->ecx, addr); cg->and_(cg->ecx, 3); cg->shl(cg->ecx, 3); // *8 + // Don't need the original address anymore. 
+  if (g_settings.gpu_pgxp_enable)
+    cg->and_(addr, ~0x3u);
+  else
+    FreeHostReg(addr.getIdx());
+
   if (inst->op == InstructionOp::swl)
   {
     // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
@@ -1769,11 +1787,18 @@ void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize siz
     cg->or_(value, RWRET);
   }
 
-  FreeHostReg(addr.getIdx());
+  GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
 
-  cg->mov(RWARG1, addr);
-  cg->and_(RWARG1, ~0x3u);
-  GenerateStore(RWARG1, value, MemoryAccessSize::Word, use_fastmem);
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG3, value);
+    FreeHostReg(value.getIdx());
+    cg->mov(RWARG2, addr);
+    FreeHostReg(addr.getIdx());
+    cg->mov(RWARG1, inst->bits);
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
+  }
 }
 
 void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
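
The change has the same shape in all four backends (x64, AArch32, AArch64, RISC-V 64): when PGXP is enabled, the value and the address are held in callee-saved host registers instead of the argument/return registers, the address is masked to its containing word (addr & ~0x3u), and after GenerateLoad/GenerateStore the block flushes for a C call and passes inst->bits, the aligned address, and the value that was actually read or written to the matching PGXP handler (PGXP::CPU_LW, PGXP::CPU_SW, PGXP::CPU_LWC2, PGXP::CPU_SWC2), so PGXP's tracking stays in line with the recompiled LWL/LWR/SWL/SWR and GTE transfers. Below is a minimal standalone C++ model of the SWL path under these assumptions; s_ram, NotifySW, StoreWordLeft and the address decoding are illustrative stand-ins rather than DuckStation APIs, and only the masking, merge and call ordering mirror the generated code.

// Minimal model of the emitted SWL sequence with PGXP enabled. Register
// allocation is omitted; what remains is the order of operations the
// recompilers now follow: mask the address, merge, store, then notify
// PGXP with the aligned address and the value that actually hit memory.
#include <cstdint>
#include <cstdio>

static uint32_t s_ram[0x200000 / 4]; // stand-in for 2MB of PSX RAM, word-indexed

// Stand-in for PGXP::CPU_SW(inst->bits, addr, value).
static void NotifySW(uint32_t instbits, uint32_t aligned_addr, uint32_t value)
{
  std::printf("PGXP_SW inst=%08x addr=%08x value=%08x\n", instbits, aligned_addr, value);
}

static void StoreWordLeft(uint32_t addr, uint32_t rt, uint32_t instbits)
{
  const uint32_t shift = (addr & 3u) * 8u; // the and_/shl on the scratch register
  const uint32_t aligned = addr & ~0x3u;   // and_(addr, ~0x3u), now done before the store

  // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;  (comment carried from the diff)
  const uint32_t mem = s_ram[(aligned & 0x1FFFFCu) / 4];
  const uint32_t merged = (mem & (UINT32_C(0xFFFFFF00) << shift)) | (rt >> (24 - shift));

  s_ram[(aligned & 0x1FFFFCu) / 4] = merged; // GenerateStore(addr, value, Word)

  // Only after the store does the block flush for the C call; PGXP sees the
  // merged word at the aligned address, matching what was written.
  NotifySW(instbits, aligned, merged);
}

int main()
{
  StoreWordLeft(0x00000002u, 0xAABBCCDDu, 0xA8220002u); // 0xA8220002 ~ "swl $2, 2($1)"
  return 0;
}

The callee-saved temporaries in the generated code exist so that value and addr survive the fastmem slow path and the FLUSH_FOR_C_CALL flush; they are copied into the argument registers and released immediately before the call, which is also why the FreeHostReg calls moved ahead of EmitCall in the lwc2/swc2 paths.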