From 3df7b22c37dca5a58d15c7471e8a6c659ea1381b Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Wed, 25 Sep 2019 15:40:08 +1000 Subject: [PATCH] GTE: Fix NCDS --- src/pse/gte.cpp | 84 +++++++++++++++++++++++++++++---------------- src/pse/gte.h | 26 +++++++++++--- src/pse/gte.inl | 65 ++++++++++++++++++++++++++++++----- src/pse/gte_types.h | 8 ++--- 4 files changed, 136 insertions(+), 47 deletions(-) diff --git a/src/pse/gte.cpp b/src/pse/gte.cpp index 0921e2965..1c1400314 100644 --- a/src/pse/gte.cpp +++ b/src/pse/gte.cpp @@ -588,52 +588,78 @@ s64 Core::VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z) return s64(s32(A[0]) * s32(B_x)) + s64(s32(A[1]) * s32(B_y)) + s64(s32(A[2]) * s32(B_z)); } +void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm) +{ +#define dot3(i) \ + TruncateAndSetMAC( \ + TruncateMAC(TruncateMAC(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \ + s64(s32(M[i][2]) * s32(Vz)), \ + sf) + + dot3(0); + dot3(1); + dot3(2); + +#undef dot3 + + TruncateAndSetIR<1>(m_regs.MAC1, lm); + TruncateAndSetIR<2>(m_regs.MAC2, lm); + TruncateAndSetIR<3>(m_regs.MAC3, lm); +} + +void Core::MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm) +{ +#define dot3(i) \ + TruncateAndSetMAC(static_cast(ZeroExtend64(T[i]) << 12) + \ + TruncateMAC(TruncateMAC(TruncateMAC(s64(s32(M[i][0]) * s32(Vx))) + \ + s64(s32(M[i][1]) * s32(Vy))) + \ + s64(s32(M[i][2]) * s32(Vz))), \ + sf) + + dot3(0); + dot3(1); + dot3(2); + +#undef dot3 + + TruncateAndSetIR<1>(m_regs.MAC1, lm); + TruncateAndSetIR<2>(m_regs.MAC2, lm); + TruncateAndSetIR<3>(m_regs.MAC3, lm); +} + void Core::NCDS(const s16 V[3], bool sf, bool lm) { const u8 shift = sf ? 12 : 0; // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12) - m_regs.MAC1 = TruncateMAC<1>(VecDot(m_regs.LLM[0], V) >> shift); - m_regs.MAC2 = TruncateMAC<2>(VecDot(m_regs.LLM[1], V) >> shift); - m_regs.MAC3 = TruncateMAC<3>(VecDot(m_regs.LLM[2], V) >> shift); - SetIR(0, m_regs.MAC1, lm); - SetIR(1, m_regs.MAC2, lm); - SetIR(2, m_regs.MAC3, lm); + MulMatVec(m_regs.LLM, V[0], V[1], V[2], sf, lm); // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) - // TODO: First multiply should check overflow - m_regs.MAC1 = TruncateMAC<1>( - ((ZeroExtend64(m_regs.RBK) * 0x1000) + VecDot(m_regs.LCM[0], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift); - m_regs.MAC2 = TruncateMAC<2>( - ((ZeroExtend64(m_regs.GBK) * 0x1000) + VecDot(m_regs.LCM[1], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift); - m_regs.MAC3 = TruncateMAC<3>( - ((ZeroExtend64(m_regs.BBK) * 0x1000) + VecDot(m_regs.LCM[2], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift); - SetIR(1, m_regs.MAC1, lm); - SetIR(2, m_regs.MAC2, lm); - SetIR(3, m_regs.MAC3, lm); + MulMatVec(m_regs.LCM, m_regs.BK, m_regs.IR1, m_regs.IR2, m_regs.IR3, sf, lm); // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for NCDx/NCCx - m_regs.MAC1 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[0]) * static_cast(m_regs.IR1)) << 4); - m_regs.MAC2 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[1]) * static_cast(m_regs.IR2)) << 4); - m_regs.MAC3 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[2]) * static_cast(m_regs.IR3)) << 4); - SetIR(1, m_regs.MAC1, false); - SetIR(2, m_regs.MAC2, false); - SetIR(3, m_regs.MAC3, false); + TruncateAndSetMAC<1>((s64(ZeroExtend64(m_regs.RGBC[0])) << 4) * s64(m_regs.MAC1), false); + TruncateAndSetMAC<2>((s64(ZeroExtend64(m_regs.RGBC[1])) << 4) * s64(m_regs.MAC2), false); + TruncateAndSetMAC<3>((s64(ZeroExtend64(m_regs.RGBC[2])) << 4) * s64(m_regs.MAC3), false); // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 ;<--- for NCDx only - // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx - m_regs.MAC1 = TruncateMAC<1>(m_regs.MAC1 + ((s32(m_regs.RFC) - m_regs.MAC1) * m_regs.IR0)); - m_regs.MAC2 = TruncateMAC<2>(m_regs.MAC2 + ((s32(m_regs.GFC) - m_regs.MAC2) * m_regs.IR0)); - m_regs.MAC3 = TruncateMAC<3>(m_regs.MAC3 + ((s32(m_regs.BFC) - m_regs.MAC3) * m_regs.IR0)); + // [IR1,IR2,IR3] = (([RFC,GFC,BFC] SHL 12) - [MAC1,MAC2,MAC3]) SAR (sf*12) + TruncateAndSetIR<1>(s32(s64(ZeroExtend64(m_regs.FC[0]) << 12) - s64(m_regs.MAC1)) >> (sf ? 12 : 0), false); + TruncateAndSetIR<2>(s32(s64(ZeroExtend64(m_regs.FC[1]) << 12) - s64(m_regs.MAC2)) >> (sf ? 12 : 0), false); + TruncateAndSetIR<3>(s32(s64(ZeroExtend64(m_regs.FC[2]) << 12) - s64(m_regs.MAC3)) >> (sf ? 12 : 0), false); + // [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]) // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx - m_regs.MAC1 >>= shift; - m_regs.MAC2 >>= shift; - m_regs.MAC3 >>= shift; + TruncateAndSetMAC<1>(s64(s32(m_regs.IR1) * s32(m_regs.IR0)) + s64(m_regs.MAC1), sf); + TruncateAndSetMAC<2>(s64(s32(m_regs.IR2) * s32(m_regs.IR0)) + s64(m_regs.MAC2), sf); + TruncateAndSetMAC<3>(s64(s32(m_regs.IR3) * s32(m_regs.IR0)) + s64(m_regs.MAC3), sf); // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] PushRGB(TruncateRGB<0>(m_regs.MAC1 / 16), TruncateRGB<1>(m_regs.MAC2 / 16), TruncateRGB<2>(m_regs.MAC3 / 16), m_regs.RGBC[3]); + TruncateAndSetIR<1>(m_regs.MAC1, lm); + TruncateAndSetIR<2>(m_regs.MAC2, lm); + TruncateAndSetIR<3>(m_regs.MAC3, lm); } void Core::Execute_NCDS(Instruction inst) diff --git a/src/pse/gte.h b/src/pse/gte.h index 48d784f6d..f1dd0d945 100644 --- a/src/pse/gte.h +++ b/src/pse/gte.h @@ -26,14 +26,26 @@ public: void ExecuteInstruction(Instruction inst); private: + static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 43); + static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 43) - 1; + static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43); + static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1; + static constexpr s32 IR0_MIN_VALUE = 0x0000; + static constexpr s32 IR0_MAX_VALUE = 0x1000; + static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15); + static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1; + template - s32 TruncateMAC(s64 value); + s64 TruncateMAC(s64 value); + + template + s32 TruncateAndSetMAC(s64 value, bool sf); template u8 TruncateRGB(s32 value); template - void SetIR(s32 value, bool lm); + s16 TruncateAndSetIR(s32 value, bool lm); void SetMAC(u32 index, s64 value); void SetIR(u32 index, s32 value, bool lm); @@ -45,8 +57,14 @@ private: s32 Divide(s32 dividend, s32 divisor); s32 SaturateDivide(s32 result); - static s64 VecDot(const s16 A[3], const s16 B[3]); - static s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z); + s64 VecDot(const s16 A[3], const s16 B[3]); + s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z); + + // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3] + void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm); + + // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3] + void MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm); void RTPS(const s16 V[3], bool sf); void NCDS(const s16 V[3], bool sf, bool lm); diff --git a/src/pse/gte.inl b/src/pse/gte.inl index 697e66332..ec1afe292 100644 --- a/src/pse/gte.inl +++ b/src/pse/gte.inl @@ -19,9 +19,11 @@ u8 GTE::Core::TruncateRGB(s32 value) } template -s32 GTE::Core::TruncateMAC(s64 value) +s64 GTE::Core::TruncateMAC(s64 value) { - if (value < INT64_C(-2147483648)) + constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE; + constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE; + if (value < MIN_VALUE) { if constexpr (index == 0) m_regs.FLAG.mac0_underflow = true; @@ -32,9 +34,9 @@ s32 GTE::Core::TruncateMAC(s64 value) else if constexpr (index == 3) m_regs.FLAG.mac3_underflow = true; - return static_cast(UINT32_C(0x80000000)); + return MIN_VALUE; } - else if (value > INT64_C(2147483647)) + else if (value > MAX_VALUE) { if constexpr (index == 0) m_regs.FLAG.mac0_overflow = true; @@ -45,13 +47,60 @@ s32 GTE::Core::TruncateMAC(s64 value) else if constexpr (index == 3) m_regs.FLAG.mac3_overflow = true; - return static_cast(UINT32_C(0x7FFFFFFF)); + return MAX_VALUE; + } + else + { + return value; } - - return static_cast(value); } template -void GTE::Core::SetIR(s32 value, bool lm) +s32 GTE::Core::TruncateAndSetMAC(s64 value, bool sf) { + value = TruncateMAC(value); + + // shift should be done before storing to avoid losing precision + if (sf) + value >>= 12; + + const s32 value32 = static_cast(value); + m_regs.dr32[24 + index] = value32; + return value32; +} + +template +s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm) +{ + constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE; + constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE; + const s32 actual_min_value = lm ? 0 : -0x8000; + if (value < actual_min_value) + { + value = actual_min_value; + if constexpr (index == 0) + m_regs.FLAG.ir0_saturated = true; + else if constexpr (index == 1) + m_regs.FLAG.ir1_saturated = true; + else if constexpr (index == 2) + m_regs.FLAG.ir2_saturated = true; + else if constexpr (index == 3) + m_regs.FLAG.ir3_saturated = true; + } + else if (value > MAX_VALUE) + { + value = MAX_VALUE; + if constexpr (index == 0) + m_regs.FLAG.ir0_saturated = true; + else if constexpr (index == 1) + m_regs.FLAG.ir1_saturated = true; + else if constexpr (index == 2) + m_regs.FLAG.ir2_saturated = true; + else if constexpr (index == 3) + m_regs.FLAG.ir3_saturated = true; + } + + // store sign-extended 16-bit value as 32-bit + m_regs.dr32[8 + index] = value; + return static_cast(value); } diff --git a/src/pse/gte_types.h b/src/pse/gte_types.h index 865c09419..d106181b1 100644 --- a/src/pse/gte_types.h +++ b/src/pse/gte_types.h @@ -103,14 +103,10 @@ union Regs s32 TR[3]; // 37-39 s16 LLM[3][3]; // 40-44 u16 pad18; // 44 - u32 RBK; // 45 - u32 GBK; // 46 - u32 BBK; // 47 + u32 BK[3]; // 45-47 s16 LCM[3][3]; // 48-52 u16 pad19; // 52 - u32 RFC; // 53 - u32 GFC; // 54 - u32 BFC; // 55 + u32 FC[3]; // 53-55 s32 OFX; // 56 s32 OFY; // 57 u16 H; // 58