CPU/PGXP: Prefer fresh over tainted Z values

Fixes the Z values of terrain polygons in Wild Arms 2 after battles.
2025-03-06 14:27:44 +00:00 · 2024-05-17 21:18:39 +10:00 · 2024-05-17 21:18:39 +10:00 · ea4efb4e52
parent 5672b0da95
commit ea4efb4e52
1 changed files with 48 additions and 29 deletions
--- a/src/core/cpu_pgxp.cpp
+++ b/src/core/cpu_pgxp.cpp
@ -46,6 +46,8 @@ enum : u32
  VALID_X = (1u << 0),
  VALID_Y = (1u << 1),
  VALID_Z = (1u << 2),
+  VALID_TAINTED_Z = (1u << 31),
+
  VALID_XY = (VALID_X | VALID_Y),
  VALID_XYZ = (VALID_X | VALID_Y | VALID_Z),
  VALID_ALL = (VALID_X | VALID_Y | VALID_Z),
@ -118,7 +120,7 @@ static void LogValueStr(SmallStringBase& str, const char* name, u32 rval, const
 // clang-format on

 static constexpr PGXP_value PGXP_value_invalid = {0.f, 0.f, 0.f, 0, 0};
-static constexpr PGXP_value PGXP_value_zero = {0.f, 0.f, 0.f, 0, VALID_ALL};
+static constexpr PGXP_value PGXP_value_zero = {0.f, 0.f, 0.f, 0, VALID_XY};

 static PGXP_value* s_mem = nullptr;
 static PGXP_value* s_vertex_cache = nullptr;
@ -216,7 +218,7 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::MakeValid(PGXP_value* pV, u32 psxV)
  pV->x = static_cast<float>(static_cast<s16>(Truncate16(psxV)));
  pV->y = static_cast<float>(static_cast<s16>(Truncate16(psxV >> 16)));
  pV->z = 0.0f;
-  pV->flags = VALID_XY;
+  pV->flags = VALID_XY | VALID_TAINTED_Z;
  pV->value = psxV;
 }

@ -371,16 +373,17 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(const PGXP_value* src, u32 addr

 ALWAYS_INLINE_RELEASE void CPU::PGXP::CopyZIfMissing(PGXP_value& dst, const PGXP_value& src)
 {
-  if (dst.HasValid(COMP_Z))
-    return;
-
-  dst.z = src.z;
+  dst.z = dst.HasValid(COMP_Z) ? dst.z : src.z;
  dst.flags |= (src.flags & VALID_Z);
 }

 ALWAYS_INLINE_RELEASE void CPU::PGXP::SelectZ(PGXP_value& dst, const PGXP_value& src1, const PGXP_value& src2)
 {
-  dst.z = src1.HasValid(COMP_Z) ? src1.z : src2.z;
+  // Prefer src2 when src1 has no valid Z, or when src1's Z is tainted (potentially imprecise) and src2's Z is valid and untainted.
+  dst.z = (!(src1.flags & VALID_Z) ||
+           (src1.flags & VALID_TAINTED_Z && (src2.flags & (VALID_Z | VALID_TAINTED_Z)) == VALID_Z)) ?
+            src2.z :
+            src1.z;
  dst.flags |= ((src1.flags | src2.flags) & VALID_Z);
 }

@ -426,7 +429,7 @@ void CPU::PGXP::LogValueStr(SmallStringBase& str, const char* name, u32 rval, co

    str.append_format(", {{{},{},{}}}", val->x, val->y, val->z);

-    if (val->flags != 0)
+    if (val->flags & VALID_ALL)
    {
      str.append(", valid=");
      if (val->flags & VALID_X)
@ -437,6 +440,9 @@ void CPU::PGXP::LogValueStr(SmallStringBase& str, const char* name, u32 rval, co
        str.append('Z');
    }

+    // if (val->flags & VALID_TAINTED_Z)
+    // str.append(", tainted");
+
    str.append(']');
  }
 }
@ -759,6 +765,8 @@ void CPU::PGXP::CPU_ADDI(u32 instr, u32 rsVal)
  prtVal.y += (prtVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prtVal.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;

  prtVal.value = rsVal + tempImm.d;
+
+  prtVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_ANDI(u32 instr, u32 rsVal)
@ -779,13 +787,14 @@ void CPU::PGXP::CPU_ANDI(u32 instr, u32 rsVal)
  prtVal.value = rtVal;
  prtVal.y = 0.f; // remove upper 16-bits
  prtVal.SetValid(COMP_Y);
+  prtVal.flags |= VALID_TAINTED_Z;

  switch (imm(instr))
  {
    case 0:
      // if 0 then x == 0
-      // TODO: x should be valid here
-      prtVal.x = 0.f;
+      prtVal.x = 0.0f;
+      prtVal.SetValid(COMP_X);
      break;
    case 0xFFFF:
      // if saturated then x == x
@ -820,6 +829,7 @@ void CPU::PGXP::CPU_ORI(u32 instr, u32 rsVal)
      // otherwise x is low precision value
      ret.x = vRt.sw.l;
      ret.SetValid(COMP_X);
+      ret.flags |= VALID_TAINTED_Z;
      break;
  }

@ -849,6 +859,7 @@ void CPU::PGXP::CPU_XORI(u32 instr, u32 rsVal)
      // otherwise x is low precision value
      ret.x = vRt.sw.l;
      ret.SetValid(COMP_X);
+      ret.flags |= VALID_TAINTED_Z;
      break;
  }

@ -869,6 +880,7 @@ void CPU::PGXP::CPU_SLTI(u32 instr, u32 rsVal)
  ret.y = 0.f;
  ret.x = (g_state.pgxp_gpr[rs(instr)].x < tempImm.sw.h) ? 1.f : 0.f;
  ret.SetValid(COMP_Y);
+  ret.flags |= VALID_TAINTED_Z;
  ret.value = BoolToUInt32(static_cast<s32>(rsVal) < imm_sext(instr));

  g_state.pgxp_gpr[rt(instr)] = ret;
@ -887,6 +899,7 @@ void CPU::PGXP::CPU_SLTIU(u32 instr, u32 rsVal)
  ret.y = 0.f;
  ret.x = (f16Unsign(g_state.pgxp_gpr[rs(instr)].x) < tempImm.w.h) ? 1.f : 0.f;
  ret.SetValid(COMP_Y);
+  ret.flags |= VALID_TAINTED_Z;
  ret.value = BoolToUInt32(rsVal < imm(instr));

  g_state.pgxp_gpr[rt(instr)] = ret;
@ -922,10 +935,12 @@ void CPU::PGXP::CPU_ADD(u32 instr, u32 rsVal, u32 rtVal)
  if (rtVal == 0)
  {
    ret = g_state.pgxp_gpr[rs(instr)];
+    CopyZIfMissing(ret, g_state.pgxp_gpr[rt(instr)]);
  }
  else if (rsVal == 0)
  {
    ret = g_state.pgxp_gpr[rt(instr)];
+    CopyZIfMissing(ret, g_state.pgxp_gpr[rs(instr)]);
  }
  else
  {
@ -951,15 +966,8 @@ void CPU::PGXP::CPU_ADD(u32 instr, u32 rsVal, u32 rtVal)
    // truncate on overflow/underflow
    ret.y += (ret.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (ret.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;

-    // TODO: decide which "z/w" component to use
-
-    ret.flags &= (g_state.pgxp_gpr[rt(instr)].flags & VALID_XY) | ~VALID_XY;
-  }
-
-  if (!(ret.flags & VALID_Z) && (g_state.pgxp_gpr[rt(instr)].flags & VALID_Z))
-  {
-    ret.z = g_state.pgxp_gpr[rt(instr)].z;
-    ret.SetValid(COMP_Z);
+    SelectZ(ret, ret, g_state.pgxp_gpr[rt(instr)]);
+    ret.flags |= VALID_TAINTED_Z;
  }

  ret.value = rsVal + rtVal;
@ -979,6 +987,7 @@ void CPU::PGXP::CPU_SUB(u32 instr, u32 rsVal, u32 rtVal)
  if (rtVal == 0)
  {
    ret = g_state.pgxp_gpr[rs(instr)];
+    CopyZIfMissing(ret, g_state.pgxp_gpr[rs(instr)]);
  }
  else
  {
@ -1003,16 +1012,11 @@ void CPU::PGXP::CPU_SUB(u32 instr, u32 rsVal, u32 rtVal)
    // truncate on overflow/underflow
    ret.y += (ret.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (ret.y < SHRT_MIN) ? USHRT_MAX + 1 : 0.f;

-    ret.flags &= (g_state.pgxp_gpr[rt(instr)].flags & VALID_XY) | ~VALID_XY;
-
-    ret.value = rsVal - rtVal;
+    SelectZ(ret, ret, g_state.pgxp_gpr[rt(instr)]);
+    ret.flags |= VALID_TAINTED_Z;
  }

-  if (!(ret.flags & VALID_Z) && (g_state.pgxp_gpr[rt(instr)].flags & VALID_Z))
-  {
-    ret.z = g_state.pgxp_gpr[rt(instr)].z;
-    ret.SetValid(COMP_Z);
-  }
+  ret.value = rsVal - rtVal;

  g_state.pgxp_gpr[rd(instr)] = ret;
 }
@ -1040,7 +1044,7 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_BITWISE(u32 instr, u32 rdVal, u32 rsVa
  valt.d = rtVal;

  PGXP_value ret;
-  ret.flags = VALID_XY;
+  ret.flags = VALID_XY | VALID_TAINTED_Z;

  if (vald.w.l == 0)
  {
@ -1163,6 +1167,7 @@ void CPU::PGXP::CPU_SLT(u32 instr, u32 rsVal, u32 rtVal)
  PGXP_value ret = g_state.pgxp_gpr[rs(instr)];
  ret.y = 0.f;
  ret.SetValid(COMP_Y);
+  ret.flags |= VALID_TAINTED_Z;

  ret.x = (g_state.pgxp_gpr[rs(instr)].y < g_state.pgxp_gpr[rt(instr)].y)                       ? 1.f :
          (f16Unsign(g_state.pgxp_gpr[rs(instr)].x) < f16Unsign(g_state.pgxp_gpr[rt(instr)].x)) ? 1.f :
@ -1191,6 +1196,7 @@ void CPU::PGXP::CPU_SLTU(u32 instr, u32 rsVal, u32 rtVal)
  PGXP_value ret = g_state.pgxp_gpr[rs(instr)];
  ret.y = 0.f;
  ret.SetValid(COMP_Y);
+  ret.flags |= VALID_TAINTED_Z;

  ret.x = (f16Unsign(g_state.pgxp_gpr[rs(instr)].y) < f16Unsign(g_state.pgxp_gpr[rt(instr)].y)) ? 1.f :
          (f16Unsign(g_state.pgxp_gpr[rs(instr)].x) < f16Unsign(g_state.pgxp_gpr[rt(instr)].x)) ? 1.f :
@ -1248,8 +1254,10 @@ void CPU::PGXP::CPU_MULT(u32 instr, u32 rsVal, u32 rtVal)

  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].x = (float)f16Sign(lx);
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].y = (float)f16Sign(ly);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].flags |= VALID_TAINTED_Z;
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].x = (float)f16Sign(hx);
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].y = (float)f16Sign(hy);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].flags |= VALID_TAINTED_Z;

  // compute PSX value
  const u64 result = static_cast<u64>(static_cast<s64>(SignExtend64(rsVal)) * static_cast<s64>(SignExtend64(rtVal)));
@ -1301,8 +1309,10 @@ void CPU::PGXP::CPU_MULTU(u32 instr, u32 rsVal, u32 rtVal)

  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].x = (float)f16Sign(lx);
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].y = (float)f16Sign(ly);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].flags |= VALID_TAINTED_Z;
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].x = (float)f16Sign(hx);
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].y = (float)f16Sign(hy);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].flags |= VALID_TAINTED_Z;

  // compute PSX value
  const u64 result = ZeroExtend64(rsVal) * ZeroExtend64(rtVal);
@ -1339,10 +1349,12 @@ void CPU::PGXP::CPU_DIV(u32 instr, u32 rsVal, u32 rtVal)
  double lo = vs / vt;
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].y = (float)f16Sign(f16Overflow(lo));
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].x = (float)f16Sign(lo);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].flags |= VALID_TAINTED_Z;

  double hi = fmod(vs, vt);
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].y = (float)f16Sign(f16Overflow(hi));
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].x = (float)f16Sign(hi);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].flags |= VALID_TAINTED_Z;

  // compute PSX value
  if (static_cast<s32>(rtVal) == 0)
@ -1396,10 +1408,12 @@ void CPU::PGXP::CPU_DIVU(u32 instr, u32 rsVal, u32 rtVal)
  double lo = vs / vt;
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].y = (float)f16Sign(f16Overflow(lo));
  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].x = (float)f16Sign(lo);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::lo)].flags |= VALID_TAINTED_Z;

  double hi = fmod(vs, vt);
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].y = (float)f16Sign(f16Overflow(hi));
  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].x = (float)f16Sign(hi);
+  g_state.pgxp_gpr[static_cast<u8>(Reg::hi)].flags |= VALID_TAINTED_Z;

  if (rtVal == 0)
  {
@ -1460,6 +1474,7 @@ void CPU::PGXP::CPU_SLL(u32 instr, u32 rtVal)
  prdVal.x = static_cast<float>(x);
  prdVal.y = static_cast<float>(y);
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_SRL(u32 instr, u32 rtVal)
@ -1525,6 +1540,7 @@ void CPU::PGXP::CPU_SRL(u32 instr, u32 rtVal)
  prdVal.x = static_cast<float>(x);
  prdVal.y = static_cast<float>(y);
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_SRA(u32 instr, u32 rtVal)
@ -1590,6 +1606,7 @@ void CPU::PGXP::CPU_SRA(u32 instr, u32 rtVal)
  prdVal.x = static_cast<float>(x);
  prdVal.y = static_cast<float>(y);
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;

  // Use low precision/rounded values when we're not shifting an entire component,
  // and it's not originally from a 3D value. Too many false positives in P2/etc.
@ -1649,6 +1666,7 @@ void CPU::PGXP::CPU_SLLV(u32 instr, u32 rtVal, u32 rsVal)
  prdVal.x = static_cast<float>(x);
  prdVal.y = static_cast<float>(y);
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_SRLV(u32 instr, u32 rtVal, u32 rsVal)
@ -1708,12 +1726,12 @@ void CPU::PGXP::CPU_SRLV(u32 instr, u32 rtVal, u32 rsVal)
  else
    y = y / (1 << sh);

-
  PGXP_value& prdVal = g_state.pgxp_gpr[rd(instr)];
  prdVal = prtVal;
  prdVal.x = static_cast<float>(f16Sign(x));
  prdVal.y = static_cast<float>(f16Sign(y));
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_SRAV(u32 instr, u32 rtVal, u32 rsVal)
@ -1778,6 +1796,7 @@ void CPU::PGXP::CPU_SRAV(u32 instr, u32 rtVal, u32 rsVal)
  prdVal.x = static_cast<float>(f16Sign(x));
  prdVal.y = static_cast<float>(f16Sign(y));
  prdVal.value = rdVal;
+  prdVal.flags |= VALID_TAINTED_Z;
 }

 void CPU::PGXP::CPU_MFC0(u32 instr, u32 rdVal)