From 3df7b22c37dca5a58d15c7471e8a6c659ea1381b Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <mclaughc@outlook.com>
Date: Wed, 25 Sep 2019 15:40:08 +1000
Subject: [PATCH] GTE: Fix NCDS

---
 src/pse/gte.cpp     | 84 +++++++++++++++++++++++++++++----------------
 src/pse/gte.h       | 26 +++++++++++---
 src/pse/gte.inl     | 65 ++++++++++++++++++++++++++++++-----
 src/pse/gte_types.h |  8 ++---
 4 files changed, 136 insertions(+), 47 deletions(-)
diff --git a/src/pse/gte.cpp b/src/pse/gte.cpp
index 0921e2965..1c1400314 100644
--- a/src/pse/gte.cpp
+++ b/src/pse/gte.cpp
@@ -588,52 +588,78 @@ s64 Core::VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z)
   return s64(s32(A[0]) * s32(B_x)) + s64(s32(A[1]) * s32(B_y)) + s64(s32(A[2]) * s32(B_z));
 }
 
+void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm)
+{
+#define dot3(i)                                                                                                        \
+  TruncateAndSetMAC<i + 1>(                                                                                            \
+    TruncateMAC<i + 1>(TruncateMAC<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) +                \
+      s64(s32(M[i][2]) * s32(Vz)),                                                                                     \
+    sf)
+  // MAC(i+1) = row i of M dotted with (Vx, Vy, Vz); each partial sum goes through TruncateMAC, sf is forwarded.
+  dot3(0);
+  dot3(1);
+  dot3(2);
+
+#undef dot3
+  // IR(n) = MAC(n), limited by TruncateAndSetIR; lm is forwarded and selects the lower bound used there.
+  TruncateAndSetIR<1>(m_regs.MAC1, lm);
+  TruncateAndSetIR<2>(m_regs.MAC2, lm);
+  TruncateAndSetIR<3>(m_regs.MAC3, lm);
+}
+
+void Core::MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm)
+{
+#define dot3(i)                                                                                                        \
+  TruncateAndSetMAC<i + 1>(static_cast<s64>(ZeroExtend64(T[i]) << 12) +                                                \
+                             TruncateMAC<i + 1>(TruncateMAC<i + 1>(TruncateMAC<i + 1>(s64(s32(M[i][0]) * s32(Vx))) +   \
+                                                                   s64(s32(M[i][1]) * s32(Vy))) +                      \
+                                                s64(s32(M[i][2]) * s32(Vz))),                                          \
+                           sf)
+  // MAC(i+1) = (T[i] << 12) + row i of M dotted with (Vx, Vy, Vz); the dot product is truncated before T is added.
+  dot3(0);
+  dot3(1);
+  dot3(2);
+  // NOTE(review): T[i] is zero-extended before the shift; if T can hold negative values this needs sign extension - confirm.
+#undef dot3
+  // IR(n) = MAC(n), limited by TruncateAndSetIR; lm is forwarded and selects the lower bound used there.
+  TruncateAndSetIR<1>(m_regs.MAC1, lm);
+  TruncateAndSetIR<2>(m_regs.MAC2, lm);
+  TruncateAndSetIR<3>(m_regs.MAC3, lm);
+}
+
 void Core::NCDS(const s16 V[3], bool sf, bool lm)
 {
   const u8 shift = sf ? 12 : 0;
 
   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12)
-  m_regs.MAC1 = TruncateMAC<1>(VecDot(m_regs.LLM[0], V) >> shift);
-  m_regs.MAC2 = TruncateMAC<2>(VecDot(m_regs.LLM[1], V) >> shift);
-  m_regs.MAC3 = TruncateMAC<3>(VecDot(m_regs.LLM[2], V) >> shift);
-  SetIR(0, m_regs.MAC1, lm);
-  SetIR(1, m_regs.MAC2, lm);
-  SetIR(2, m_regs.MAC3, lm);
+  MulMatVec(m_regs.LLM, V[0], V[1], V[2], sf, lm);
 
   // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
-  // TODO: First multiply should check overflow
-  m_regs.MAC1 = TruncateMAC<1>(
-    ((ZeroExtend64(m_regs.RBK) * 0x1000) + VecDot(m_regs.LCM[0], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
-  m_regs.MAC2 = TruncateMAC<2>(
-    ((ZeroExtend64(m_regs.GBK) * 0x1000) + VecDot(m_regs.LCM[1], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
-  m_regs.MAC3 = TruncateMAC<3>(
-    ((ZeroExtend64(m_regs.BBK) * 0x1000) + VecDot(m_regs.LCM[2], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
-  SetIR(1, m_regs.MAC1, lm);
-  SetIR(2, m_regs.MAC2, lm);
-  SetIR(3, m_regs.MAC3, lm);
+  MulMatVec(m_regs.LCM, m_regs.BK, m_regs.IR1, m_regs.IR2, m_regs.IR3, sf, lm);
 
   // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4          ;<--- for NCDx/NCCx
-  m_regs.MAC1 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[0]) * static_cast<u16>(m_regs.IR1)) << 4);
-  m_regs.MAC2 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[1]) * static_cast<u16>(m_regs.IR2)) << 4);
-  m_regs.MAC3 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[2]) * static_cast<u16>(m_regs.IR3)) << 4);
-  SetIR(1, m_regs.MAC1, false);
-  SetIR(2, m_regs.MAC2, false);
-  SetIR(3, m_regs.MAC3, false);
+  TruncateAndSetMAC<1>((s64(ZeroExtend64(m_regs.RGBC[0])) << 4) * s64(m_regs.IR1), false);
+  TruncateAndSetMAC<2>((s64(ZeroExtend64(m_regs.RGBC[1])) << 4) * s64(m_regs.IR2), false);
+  TruncateAndSetMAC<3>((s64(ZeroExtend64(m_regs.RGBC[2])) << 4) * s64(m_regs.IR3), false);
 
   // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0                   ;<--- for NCDx only
-  // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12)       ;<--- for NCDx/NCCx
-  m_regs.MAC1 = TruncateMAC<1>(m_regs.MAC1 + ((s32(m_regs.RFC) - m_regs.MAC1) * m_regs.IR0));
-  m_regs.MAC2 = TruncateMAC<2>(m_regs.MAC2 + ((s32(m_regs.GFC) - m_regs.MAC2) * m_regs.IR0));
-  m_regs.MAC3 = TruncateMAC<3>(m_regs.MAC3 + ((s32(m_regs.BFC) - m_regs.MAC3) * m_regs.IR0));
+  //   [IR1,IR2,IR3] = (([RFC,GFC,BFC] SHL 12) - [MAC1,MAC2,MAC3]) SAR (sf*12), saturated with lm=0
+  TruncateAndSetIR<1>(s32(s64(ZeroExtend64(m_regs.FC[0]) << 12) - s64(m_regs.MAC1)) >> shift, false);
+  TruncateAndSetIR<2>(s32(s64(ZeroExtend64(m_regs.FC[1]) << 12) - s64(m_regs.MAC2)) >> shift, false);
+  TruncateAndSetIR<3>(s32(s64(ZeroExtend64(m_regs.FC[2]) << 12) - s64(m_regs.MAC3)) >> shift, false);
 
+  //   [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3]); the sf shift happens in TruncateAndSetMAC
   // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12)       ;<--- for NCDx/NCCx
-  m_regs.MAC1 >>= shift;
-  m_regs.MAC2 >>= shift;
-  m_regs.MAC3 >>= shift;
+  TruncateAndSetMAC<1>(s64(s32(m_regs.IR1) * s32(m_regs.IR0)) + s64(m_regs.MAC1), sf);
+  TruncateAndSetMAC<2>(s64(s32(m_regs.IR2) * s32(m_regs.IR0)) + s64(m_regs.MAC2), sf);
+  TruncateAndSetMAC<3>(s64(s32(m_regs.IR3) * s32(m_regs.IR0)) + s64(m_regs.MAC3), sf);
 
   // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
   PushRGB(TruncateRGB<0>(m_regs.MAC1 / 16), TruncateRGB<1>(m_regs.MAC2 / 16), TruncateRGB<2>(m_regs.MAC3 / 16),
           m_regs.RGBC[3]);
+  TruncateAndSetIR<1>(m_regs.MAC1, lm);
+  TruncateAndSetIR<2>(m_regs.MAC2, lm);
+  TruncateAndSetIR<3>(m_regs.MAC3, lm);
 }
 
 void Core::Execute_NCDS(Instruction inst)
diff --git a/src/pse/gte.h b/src/pse/gte.h
index 48d784f6d..f1dd0d945 100644
--- a/src/pse/gte.h
+++ b/src/pse/gte.h
@@ -26,14 +26,26 @@ public:
   void ExecuteInstruction(Instruction inst);
 
 private:
+  static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 31);      // MAC0 is limited to a signed 32-bit result
+  static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 31) - 1;
+  static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43);    // MAC1-3 are limited to a signed 44-bit result
+  static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1;
+  static constexpr s32 IR0_MIN_VALUE = 0x0000;                    // IR0 saturates to 0..0x1000
+  static constexpr s32 IR0_MAX_VALUE = 0x1000;
+  static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);     // IR1-3 saturate to the signed 16-bit range
+  static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
+
   template<u32 index>
-  s32 TruncateMAC(s64 value);
+  s64 TruncateMAC(s64 value);
+
+  template<u32 index>
+  s32 TruncateAndSetMAC(s64 value, bool sf);
 
   template<u32 index>
   u8 TruncateRGB(s32 value);
 
   template<u32 index>
-  void SetIR(s32 value, bool lm);
+  s16 TruncateAndSetIR(s32 value, bool lm);
 
   void SetMAC(u32 index, s64 value);
   void SetIR(u32 index, s32 value, bool lm);
@@ -45,8 +57,14 @@ private:
   s32 Divide(s32 dividend, s32 divisor);
   s32 SaturateDivide(s32 result);
 
-  static s64 VecDot(const s16 A[3], const s16 B[3]);
-  static s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z);
+  s64 VecDot(const s16 A[3], const s16 B[3]);
+  s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z);
+
+  // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
+  void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm);
+  
+  // 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
+  void MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm);
 
   void RTPS(const s16 V[3], bool sf);
   void NCDS(const s16 V[3], bool sf, bool lm);
diff --git a/src/pse/gte.inl b/src/pse/gte.inl
index 697e66332..ec1afe292 100644
--- a/src/pse/gte.inl
+++ b/src/pse/gte.inl
@@ -19,9 +19,11 @@ u8 GTE::Core::TruncateRGB(s32 value)
 }
 
 template<u32 index>
-s32 GTE::Core::TruncateMAC(s64 value)
+s64 GTE::Core::TruncateMAC(s64 value)
 {
-  if (value < INT64_C(-2147483648))
+  constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
+  constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
+  if (value < MIN_VALUE)
   {
     if constexpr (index == 0)
       m_regs.FLAG.mac0_underflow = true;
@@ -32,9 +34,9 @@ s32 GTE::Core::TruncateMAC(s64 value)
     else if constexpr (index == 3)
       m_regs.FLAG.mac3_underflow = true;
 
-    return static_cast<s32>(UINT32_C(0x80000000));
+    return MIN_VALUE;
   }
-  else if (value > INT64_C(2147483647))
+  else if (value > MAX_VALUE)
   {
     if constexpr (index == 0)
       m_regs.FLAG.mac0_overflow = true;
@@ -45,13 +47,60 @@ s32 GTE::Core::TruncateMAC(s64 value)
     else if constexpr (index == 3)
       m_regs.FLAG.mac3_overflow = true;
 
-    return static_cast<s32>(UINT32_C(0x7FFFFFFF));
+    return MAX_VALUE;
+  }
+  else
+  {
+    return value;
   }
-
-  return static_cast<s32>(value);
 }
 
 template<u32 index>
-void GTE::Core::SetIR(s32 value, bool lm)
+s32 GTE::Core::TruncateAndSetMAC(s64 value, bool sf)
 {
+  value = TruncateMAC<index>(value); // clamp to the MAC<index> limits; out-of-range input sets the FLAG bit
+  // sf selects a right shift by 12, applied to the clamped 64-bit value.
+  // shift should be done before storing to avoid losing precision
+  if (sf)
+    value >>= 12;
+  // Narrow to 32 bits, store at dr32[24 + index], and return the stored value to the caller.
+  const s32 value32 = static_cast<s32>(value);
+  m_regs.dr32[24 + index] = value32;
+  return value32;
+}
+
+template<u32 index>
+s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
+{ // Saturates value to the IR<index> range, sets the matching FLAG bit if it was clamped, then stores it.
+  constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
+  constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
+  const s32 actual_min_value = lm ? 0 : MIN_VALUE; // lm raises the lower bound to zero
+  if (value < actual_min_value)
+  {
+    value = actual_min_value;
+    if constexpr (index == 0)
+      m_regs.FLAG.ir0_saturated = true;
+    else if constexpr (index == 1)
+      m_regs.FLAG.ir1_saturated = true;
+    else if constexpr (index == 2)
+      m_regs.FLAG.ir2_saturated = true;
+    else if constexpr (index == 3)
+      m_regs.FLAG.ir3_saturated = true;
+  }
+  else if (value > MAX_VALUE)
+  {
+    value = MAX_VALUE;
+    if constexpr (index == 0)
+      m_regs.FLAG.ir0_saturated = true;
+    else if constexpr (index == 1)
+      m_regs.FLAG.ir1_saturated = true;
+    else if constexpr (index == 2)
+      m_regs.FLAG.ir2_saturated = true;
+    else if constexpr (index == 3)
+      m_regs.FLAG.ir3_saturated = true;
+  }
+  // value now lies within [actual_min_value, MAX_VALUE]; the return value is the same number as s16.
+  // store sign-extended 16-bit value as 32-bit
+  m_regs.dr32[8 + index] = value;
+  return static_cast<s16>(value);
 }
diff --git a/src/pse/gte_types.h b/src/pse/gte_types.h
index 865c09419..d106181b1 100644
--- a/src/pse/gte_types.h
+++ b/src/pse/gte_types.h
@@ -103,14 +103,10 @@ union Regs
     s32 TR[3];     // 37-39
     s16 LLM[3][3]; // 40-44
     u16 pad18;     // 44
-    u32 RBK;       // 45
-    u32 GBK;       // 46
-    u32 BBK;       // 47
+    u32 BK[3];     // 45-47
     s16 LCM[3][3]; // 48-52
     u16 pad19;     // 52
-    u32 RFC;       // 53
-    u32 GFC;       // 54
-    u32 BFC;       // 55
+    u32 FC[3];     // 53-55
     s32 OFX;       // 56
     s32 OFY;       // 57
     u16 H;         // 58