GTE: Fix MVMVA flags due to missing 43-bit-sign-extend

2025-01-29 19:15:38 +00:00 · 2019-09-28 15:18:50 +10:00 · 2019-09-28 15:18:50 +10:00 · 8841934009
parent d3893bc9f2
commit 8841934009
4 changed files with 69 additions and 42 deletions
--- a/src/common/types.h
+++ b/src/common/types.h
@ -190,6 +190,15 @@ constexpr bool ConvertToBoolUnchecked(TValue value)
  return ret;
 }

+// Generic sign extension: treats the low NBITS bits of value as a two's-complement number and widens it to the
+// full width of T. Bits above bit NBITS-1 are discarded and replaced with copies of bit NBITS-1.
+//   NBITS - number of significant low bits, from 1 up to the bit width of T (checked at compile time).
+//   value - integer holding the NBITS-wide field; T may be signed or unsigned.
+// Returns the sign-extended value as T.
+template<int NBITS, typename T>
+constexpr T SignExtendN(T value)
+{
+  using SignedT = std::make_signed_t<T>;
+  using UnsignedT = std::make_unsigned_t<T>;
+
+  constexpr int type_bits = static_cast<int>(8 * sizeof(T));
+  static_assert(NBITS > 0 && NBITS <= type_bits, "NBITS must be between 1 and the bit width of T");
+
+  // http://graphics.stanford.edu/~seander/bithacks.html#VariableSignExtend
+  constexpr int shift = type_bits - NBITS;
+
+  // Left-shift in the unsigned domain: shifting a negative signed value left is undefined before C++20.
+  // Narrowing to SignedT before the right shift matters for types smaller than int, where integer promotion
+  // would otherwise keep the bits that are supposed to fall off the top.
+  const SignedT aligned = static_cast<SignedT>(static_cast<UnsignedT>(value) << shift);
+
+  // Right-shifting the signed value copies the sign bit back down (arithmetic shift, as the original relied on).
+  return static_cast<T>(aligned >> shift);
+}
+
 // Enum class bitwise operators
 #define IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(type_)                                                                  \
  inline constexpr type_ operator&(type_ lhs, type_ rhs)                                                               \
--- a/src/pse/gte.cpp
+++ b/src/pse/gte.cpp
@ -405,11 +405,12 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
 {
  const u8 shift = sf ? 12 : 0;
 #define dot3(i)                                                                                                        \
-  CheckMACResult<i + 1>(                                                                                               \
+  SignExtendMACResult<i + 1>(                                                                                          \
    (s64(m_regs.TR[i]) << 12) +                                                                                        \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) +         \
-                                                s64(s32(m_regs.RT[i][1]) * s32(V[1]))) +                               \
-                          s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
+    SignExtendMACResult<i + 1>(                                                                                        \
+      SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) +                   \
+                                 s64(s32(m_regs.RT[i][1]) * s32(V[1]))) +                                              \
+      s64(s32(m_regs.RT[i][2]) * s32(V[2]))))

  // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12)
  // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12)
@ -451,14 +452,17 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)

  // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
  // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
-  const s64 Sx = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX), 0);
-  const s64 Sy = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY), 0);
+  // Both screen coordinates are computed through MAC0 (see the formulas above), so each 64-bit result is
+  // overflow-checked and stored via MAC0. Sy is written second, leaving MAC0 holding the Y result; MAC1 must
+  // not be touched here. The untruncated 64-bit values are kept for the SXY push below.
+  const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX);
+  const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY);
+  TruncateAndSetMAC<0>(Sx, 0);
+  TruncateAndSetMAC<0>(Sy, 0);
  PushSXY(s32(Sx >> 16), s32(Sy >> 16));

  if (last)
  {
    // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h  ;Depth cueing 0..+1000h
-    const s64 Sz = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB), 0);
+    const s64 Sz = s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB);
+    TruncateAndSetMAC<0>(Sz, 0);
    TruncateAndSetIR<0>(s32(Sz >> 12), true);
  }
 }
@ -517,8 +521,7 @@ void Core::Execute_AVSZ3(Instruction inst)
 {
  m_regs.FLAG.Clear();

-  const s64 result =
-    TruncateAndSetMAC<0>(s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
+  const s64 result = s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
  TruncateAndSetMAC<0>(result, 0);
  SetOTZ(s32(result >> 12));

@ -529,8 +532,7 @@ void Core::Execute_AVSZ4(Instruction inst)
 {
  m_regs.FLAG.Clear();

-  const s64 result = TruncateAndSetMAC<0>(
-    s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
+  const s64 result = s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
  TruncateAndSetMAC<0>(result, 0);
  SetOTZ(s32(result >> 12));

@ -540,41 +542,31 @@ void Core::Execute_AVSZ4(Instruction inst)
 void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
+  // dot3(i) forms row i of M times (Vx, Vy, Vz) in 64-bit arithmetic. The sum of the first two products passes
+  // through SignExtendMACResult<i + 1> before the third product is added, and TruncateAndSetMACAndIR<i + 1>
+  // receives the total together with shift and lm.
 #define dot3(i)                                                                                                        \
-  TruncateAndSetMAC<i + 1>(                                                                                            \
-    CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) +          \
-      s64(s32(M[i][2]) * s32(Vz)),                                                                                     \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M[i][0]) * s64(Vx)) + (s64(M[i][1]) * s64(Vy))) +      \
+                                  (s64(M[i][2]) * s64(Vz)),                                                            \
+                                shift, lm)

  dot3(0);
  dot3(1);
  dot3(2);

 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }

 void Core::MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
 {
+  // dot3(i) accumulates (T[i] << 12) plus row i of M times (Vx, Vy, Vz) in 64-bit arithmetic.
+  // SignExtendMACResult<i + 1> is applied after the first product and again after the second; the third product
+  // is then added and the total is handed to TruncateAndSetMACAndIR<i + 1> together with shift and lm.
 #define dot3(i)                                                                                                        \
-  TruncateAndSetMAC<i + 1>(                                                                                            \
-    (s64(T[i]) << 12) +                                                                                                \
-      CheckMACResult<i + 1>(                                                                                           \
-        CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) +      \
-        s64(s32(M[i][2]) * s32(Vz))),                                                                                  \
-    shift)
+  TruncateAndSetMACAndIR<i + 1>(                                                                                       \
+    SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M[i][0]) * s64(Vx))) +              \
+                               (s64(M[i][1]) * s64(Vy))) +                                                             \
+      (s64(M[i][2]) * s64(Vz)),                                                                                        \
+    shift, lm)

  dot3(0);
  dot3(1);
  dot3(2);

 #undef dot3
-
-  TruncateAndSetIR<1>(m_regs.MAC1, lm);
-  TruncateAndSetIR<2>(m_regs.MAC2, lm);
-  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }

 void Core::NCCS(const s16 V[3], bool sf, bool lm)
--- a/src/pse/gte.h
+++ b/src/pse/gte.h
@ -35,15 +35,22 @@ private:
  static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
  static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;

-  // Checks for underflow/overflow. Returns the value untouched so it can be threaded through an expression.
+  // Checks for underflow/overflow.
  template<u32 index>
-  s64 CheckMACResult(s64 value);
+  void CheckMACOverflow(s64 value);
+
+  // Checks for underflow/overflow, then sign-extends the value from 31 bits (index 0) or 44 bits (index 1-3).
+  template<u32 index>
+  s64 SignExtendMACResult(s64 value);

  template<u32 index>
-  s64 TruncateAndSetMAC(s64 value, u8 shift);
+  void TruncateAndSetMAC(s64 value, u8 shift);

  template<u32 index>
-  s16 TruncateAndSetIR(s32 value, bool lm);
+  void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm);
+
+  template<u32 index>
+  void TruncateAndSetIR(s32 value, bool lm);

  template<u32 index>
  u8 TruncateRGB(s32 value);
@ -55,7 +62,7 @@ private:

  // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
  void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
-  
+
  // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
  void MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);

--- a/src/pse/gte.inl
+++ b/src/pse/gte.inl
@ -1,7 +1,7 @@
 #include "gte.h"

 template<u32 index>
-s64 GTE::Core::CheckMACResult(s64 value)
+void GTE::Core::CheckMACOverflow(s64 value)
 {
  constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
  constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
@ -27,24 +27,28 @@ s64 GTE::Core::CheckMACResult(s64 value)
    else if constexpr (index == 3)
      m_regs.FLAG.mac3_overflow = true;
  }
-
-  return value;
 }

 template<u32 index>
-s64 GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+s64 GTE::Core::SignExtendMACResult(s64 value)
 {
-  value = CheckMACResult<index>(value);
+  // Width handed to SignExtendN: 31 bits for index 0, 44 bits for every other index.
+  constexpr int SIGN_EXTEND_BITS = (index == 0) ? 31 : 44;
+
+  // Record any overflow against the incoming value first, then return its sign-extended form.
+  CheckMACOverflow<index>(value);
+  return SignExtendN<SIGN_EXTEND_BITS>(value);
+}
+
+// Runs CheckMACOverflow<index> on the incoming 64-bit value, shifts it right by `shift`, and stores the
+// Truncate32() result in m_regs.dr32[24 + index]. Nothing is returned; callers keep their own 64-bit copy.
+template<u32 index>
+void GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
+{
+  CheckMACOverflow<index>(value);

   // shift should be done before storing to avoid losing precision
   value >>= shift;

   m_regs.dr32[24 + index] = Truncate32(static_cast<u64>(value));
-  return value;
 }

 template<u32 index>
-s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
+void GTE::Core::TruncateAndSetIR(s32 value, bool lm)
 {
  constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
  constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
@ -76,7 +80,22 @@ s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)

  // store sign-extended 16-bit value as 32-bit
  m_regs.dr32[8 + index] = value;
-  return static_cast<s16>(value);
+}
+
+// Combined MAC/IR update: checks the 64-bit value for overflow, shifts it, stores the 32-bit result in
+// m_regs.dr32[24 + index], and forwards that same 32-bit value to TruncateAndSetIR<index> along with lm.
+template<u32 index>
+void GTE::Core::TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm)
+{
+  // Overflow is checked against the incoming value, before any shifting.
+  CheckMACOverflow<index>(value);
+
+  // Shift first, then narrow to 32 bits, so the shift still sees the full-width value.
+  const s32 mac_value = static_cast<s32>(value >> shift);
+
+  // MAC register slot
+  m_regs.dr32[24 + index] = mac_value;
+
+  // The matching IR register is derived from the same 32-bit value.
+  TruncateAndSetIR<index>(mac_value, lm);
+}

 template<u32 index>