GTE: Fix NCDS

This commit is contained in:
Connor McLaughlin 2019-09-25 15:40:08 +10:00
parent 607cd4d3e4
commit 3df7b22c37
4 changed files with 136 additions and 47 deletions

View file

@ -588,52 +588,78 @@ s64 Core::VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z)
return s64(s32(A[0]) * s32(B_x)) + s64(s32(A[1]) * s32(B_y)) + s64(s32(A[2]) * s32(B_z)); return s64(s32(A[0]) * s32(B_x)) + s64(s32(A[1]) * s32(B_y)) + s64(s32(A[2]) * s32(B_z));
} }
void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm)
{
#define dot3(i) \
TruncateAndSetMAC<i + 1>( \
TruncateMAC<i + 1>(TruncateMAC<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \
s64(s32(M[i][2]) * s32(Vz)), \
sf)
dot3(0);
dot3(1);
dot3(2);
#undef dot3
TruncateAndSetIR<1>(m_regs.MAC1, lm);
TruncateAndSetIR<2>(m_regs.MAC2, lm);
TruncateAndSetIR<3>(m_regs.MAC3, lm);
}
void Core::MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm)
{
#define dot3(i) \
TruncateAndSetMAC<i + 1>(static_cast<s64>(ZeroExtend64(T[i]) << 12) + \
TruncateMAC<i + 1>(TruncateMAC<i + 1>(TruncateMAC<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + \
s64(s32(M[i][1]) * s32(Vy))) + \
s64(s32(M[i][2]) * s32(Vz))), \
sf)
dot3(0);
dot3(1);
dot3(2);
#undef dot3
TruncateAndSetIR<1>(m_regs.MAC1, lm);
TruncateAndSetIR<2>(m_regs.MAC2, lm);
TruncateAndSetIR<3>(m_regs.MAC3, lm);
}
void Core::NCDS(const s16 V[3], bool sf, bool lm) void Core::NCDS(const s16 V[3], bool sf, bool lm)
{ {
const u8 shift = sf ? 12 : 0; const u8 shift = sf ? 12 : 0;
// [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12) // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (LLM*V0) SAR (sf*12)
m_regs.MAC1 = TruncateMAC<1>(VecDot(m_regs.LLM[0], V) >> shift); MulMatVec(m_regs.LLM, V[0], V[1], V[2], sf, lm);
m_regs.MAC2 = TruncateMAC<2>(VecDot(m_regs.LLM[1], V) >> shift);
m_regs.MAC3 = TruncateMAC<3>(VecDot(m_regs.LLM[2], V) >> shift);
SetIR(0, m_regs.MAC1, lm);
SetIR(1, m_regs.MAC2, lm);
SetIR(2, m_regs.MAC3, lm);
// [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12) // [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] = (BK*1000h + LCM*IR) SAR (sf*12)
// TODO: First multiply should check overflow MulMatVec(m_regs.LCM, m_regs.BK, m_regs.IR1, m_regs.IR2, m_regs.IR3, sf, lm);
m_regs.MAC1 = TruncateMAC<1>(
((ZeroExtend64(m_regs.RBK) * 0x1000) + VecDot(m_regs.LCM[0], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
m_regs.MAC2 = TruncateMAC<2>(
((ZeroExtend64(m_regs.GBK) * 0x1000) + VecDot(m_regs.LCM[1], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
m_regs.MAC3 = TruncateMAC<3>(
((ZeroExtend64(m_regs.BBK) * 0x1000) + VecDot(m_regs.LCM[2], m_regs.IR1, m_regs.IR2, m_regs.IR3)) >> shift);
SetIR(1, m_regs.MAC1, lm);
SetIR(2, m_regs.MAC2, lm);
SetIR(3, m_regs.MAC3, lm);
// [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for NCDx/NCCx // [MAC1,MAC2,MAC3] = [R*IR1,G*IR2,B*IR3] SHL 4 ;<--- for NCDx/NCCx
m_regs.MAC1 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[0]) * static_cast<u16>(m_regs.IR1)) << 4); TruncateAndSetMAC<1>((s64(ZeroExtend64(m_regs.RGBC[0])) << 4) * s64(m_regs.MAC1), false);
m_regs.MAC2 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[1]) * static_cast<u16>(m_regs.IR2)) << 4); TruncateAndSetMAC<2>((s64(ZeroExtend64(m_regs.RGBC[1])) << 4) * s64(m_regs.MAC2), false);
m_regs.MAC3 = TruncateMAC<1>((ZeroExtend64(m_regs.RGBC[2]) * static_cast<u16>(m_regs.IR3)) << 4); TruncateAndSetMAC<3>((s64(ZeroExtend64(m_regs.RGBC[2])) << 4) * s64(m_regs.MAC3), false);
SetIR(1, m_regs.MAC1, false);
SetIR(2, m_regs.MAC2, false);
SetIR(3, m_regs.MAC3, false);
// [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 ;<--- for NCDx only // [MAC1,MAC2,MAC3] = MAC+(FC-MAC)*IR0 ;<--- for NCDx only
// [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx // [IR1,IR2,IR3] = (([RFC,GFC,BFC] SHL 12) - [MAC1,MAC2,MAC3]) SAR (sf*12)
m_regs.MAC1 = TruncateMAC<1>(m_regs.MAC1 + ((s32(m_regs.RFC) - m_regs.MAC1) * m_regs.IR0)); TruncateAndSetIR<1>(s32(s64(ZeroExtend64(m_regs.FC[0]) << 12) - s64(m_regs.MAC1)) >> (sf ? 12 : 0), false);
m_regs.MAC2 = TruncateMAC<2>(m_regs.MAC2 + ((s32(m_regs.GFC) - m_regs.MAC2) * m_regs.IR0)); TruncateAndSetIR<2>(s32(s64(ZeroExtend64(m_regs.FC[1]) << 12) - s64(m_regs.MAC2)) >> (sf ? 12 : 0), false);
m_regs.MAC3 = TruncateMAC<3>(m_regs.MAC3 + ((s32(m_regs.BFC) - m_regs.MAC3) * m_regs.IR0)); TruncateAndSetIR<3>(s32(s64(ZeroExtend64(m_regs.FC[2]) << 12) - s64(m_regs.MAC3)) >> (sf ? 12 : 0), false);
// [MAC1,MAC2,MAC3] = (([IR1,IR2,IR3] * IR0) + [MAC1,MAC2,MAC3])
// [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx // [MAC1,MAC2,MAC3] = [MAC1,MAC2,MAC3] SAR (sf*12) ;<--- for NCDx/NCCx
m_regs.MAC1 >>= shift; TruncateAndSetMAC<1>(s64(s32(m_regs.IR1) * s32(m_regs.IR0)) + s64(m_regs.MAC1), sf);
m_regs.MAC2 >>= shift; TruncateAndSetMAC<2>(s64(s32(m_regs.IR2) * s32(m_regs.IR0)) + s64(m_regs.MAC2), sf);
m_regs.MAC3 >>= shift; TruncateAndSetMAC<3>(s64(s32(m_regs.IR3) * s32(m_regs.IR0)) + s64(m_regs.MAC3), sf);
// Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3] // Color FIFO = [MAC1/16,MAC2/16,MAC3/16,CODE], [IR1,IR2,IR3] = [MAC1,MAC2,MAC3]
PushRGB(TruncateRGB<0>(m_regs.MAC1 / 16), TruncateRGB<1>(m_regs.MAC2 / 16), TruncateRGB<2>(m_regs.MAC3 / 16), PushRGB(TruncateRGB<0>(m_regs.MAC1 / 16), TruncateRGB<1>(m_regs.MAC2 / 16), TruncateRGB<2>(m_regs.MAC3 / 16),
m_regs.RGBC[3]); m_regs.RGBC[3]);
TruncateAndSetIR<1>(m_regs.MAC1, lm);
TruncateAndSetIR<2>(m_regs.MAC2, lm);
TruncateAndSetIR<3>(m_regs.MAC3, lm);
} }
void Core::Execute_NCDS(Instruction inst) void Core::Execute_NCDS(Instruction inst)

View file

@ -26,14 +26,26 @@ public:
void ExecuteInstruction(Instruction inst); void ExecuteInstruction(Instruction inst);
private: private:
static constexpr s64 MAC0_MIN_VALUE = -(INT64_C(1) << 43);
static constexpr s64 MAC0_MAX_VALUE = (INT64_C(1) << 43) - 1;
static constexpr s64 MAC123_MIN_VALUE = -(INT64_C(1) << 43);
static constexpr s64 MAC123_MAX_VALUE = (INT64_C(1) << 43) - 1;
static constexpr s32 IR0_MIN_VALUE = 0x0000;
static constexpr s32 IR0_MAX_VALUE = 0x1000;
static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
template<u32 index> template<u32 index>
s32 TruncateMAC(s64 value); s64 TruncateMAC(s64 value);
template<u32 index>
s32 TruncateAndSetMAC(s64 value, bool sf);
template<u32 index> template<u32 index>
u8 TruncateRGB(s32 value); u8 TruncateRGB(s32 value);
template<u32 index> template<u32 index>
void SetIR(s32 value, bool lm); s16 TruncateAndSetIR(s32 value, bool lm);
void SetMAC(u32 index, s64 value); void SetMAC(u32 index, s64 value);
void SetIR(u32 index, s32 value, bool lm); void SetIR(u32 index, s32 value, bool lm);
@ -45,8 +57,14 @@ private:
s32 Divide(s32 dividend, s32 divisor); s32 Divide(s32 dividend, s32 divisor);
s32 SaturateDivide(s32 result); s32 SaturateDivide(s32 result);
static s64 VecDot(const s16 A[3], const s16 B[3]); s64 VecDot(const s16 A[3], const s16 B[3]);
static s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z); s64 VecDot(const s16 A[3], s16 B_x, s16 B_y, s16 B_z);
// 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm);
// 3x3 matrix * 3x1 vector with translation, updates MAC[1-3] and IR[1-3]
void MulMatVec(const s16 M[3][3], const u32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, bool sf, bool lm);
void RTPS(const s16 V[3], bool sf); void RTPS(const s16 V[3], bool sf);
void NCDS(const s16 V[3], bool sf, bool lm); void NCDS(const s16 V[3], bool sf, bool lm);

View file

@ -19,9 +19,11 @@ u8 GTE::Core::TruncateRGB(s32 value)
} }
template<u32 index> template<u32 index>
s32 GTE::Core::TruncateMAC(s64 value) s64 GTE::Core::TruncateMAC(s64 value)
{ {
if (value < INT64_C(-2147483648)) constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
if (value < MIN_VALUE)
{ {
if constexpr (index == 0) if constexpr (index == 0)
m_regs.FLAG.mac0_underflow = true; m_regs.FLAG.mac0_underflow = true;
@ -32,9 +34,9 @@ s32 GTE::Core::TruncateMAC(s64 value)
else if constexpr (index == 3) else if constexpr (index == 3)
m_regs.FLAG.mac3_underflow = true; m_regs.FLAG.mac3_underflow = true;
return static_cast<s32>(UINT32_C(0x80000000)); return MIN_VALUE;
} }
else if (value > INT64_C(2147483647)) else if (value > MAX_VALUE)
{ {
if constexpr (index == 0) if constexpr (index == 0)
m_regs.FLAG.mac0_overflow = true; m_regs.FLAG.mac0_overflow = true;
@ -45,13 +47,60 @@ s32 GTE::Core::TruncateMAC(s64 value)
else if constexpr (index == 3) else if constexpr (index == 3)
m_regs.FLAG.mac3_overflow = true; m_regs.FLAG.mac3_overflow = true;
return static_cast<s32>(UINT32_C(0x7FFFFFFF)); return MAX_VALUE;
}
else
{
return value;
} }
return static_cast<s32>(value);
} }
template<u32 index> template<u32 index>
void GTE::Core::SetIR(s32 value, bool lm) s32 GTE::Core::TruncateAndSetMAC(s64 value, bool sf)
{ {
value = TruncateMAC<index>(value);
// shift should be done before storing to avoid losing precision
if (sf)
value >>= 12;
const s32 value32 = static_cast<s32>(value);
m_regs.dr32[24 + index] = value32;
return value32;
}
// Saturates `value` to the legal range of IR<index>, records saturation in the
// matching FLAG bit, stores the result in the IR register slot and returns it.
//
// index: 0 selects IR0 (range IR0_MIN_VALUE..IR0_MAX_VALUE), 1-3 select IR1..IR3
//        (range IR123_MIN_VALUE..IR123_MAX_VALUE).
// lm:    when true the lower bound is raised to 0; the upper bound is unaffected.
template<u32 index>
s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
{
  constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
  constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;

  // Use the per-register minimum rather than a hard-coded -0x8000 so that IR0
  // is clamped at its own lower bound even when lm is not set.
  const s32 actual_min_value = lm ? 0 : MIN_VALUE;

  const bool below_range = (value < actual_min_value);
  const bool above_range = (value > MAX_VALUE);
  if (below_range || above_range)
  {
    value = below_range ? actual_min_value : MAX_VALUE;

    if constexpr (index == 0)
      m_regs.FLAG.ir0_saturated = true;
    else if constexpr (index == 1)
      m_regs.FLAG.ir1_saturated = true;
    else if constexpr (index == 2)
      m_regs.FLAG.ir2_saturated = true;
    else if constexpr (index == 3)
      m_regs.FLAG.ir3_saturated = true;
  }

  // store sign-extended 16-bit value as 32-bit
  m_regs.dr32[8 + index] = value;
  return static_cast<s16>(value);
}

View file

@ -103,14 +103,10 @@ union Regs
s32 TR[3]; // 37-39 s32 TR[3]; // 37-39
s16 LLM[3][3]; // 40-44 s16 LLM[3][3]; // 40-44
u16 pad18; // 44 u16 pad18; // 44
u32 RBK; // 45 u32 BK[3]; // 45-47
u32 GBK; // 46
u32 BBK; // 47
s16 LCM[3][3]; // 48-52 s16 LCM[3][3]; // 48-52
u16 pad19; // 52 u16 pad19; // 52
u32 RFC; // 53 u32 FC[3]; // 53-55
u32 GFC; // 54
u32 BFC; // 55
s32 OFX; // 56 s32 OFX; // 56
s32 OFY; // 57 s32 OFY; // 57
u16 H; // 58 u16 H; // 58