GTE: Fix MVMVA flags due to missing 43-bit-sign-extend

This commit is contained in:
Connor McLaughlin 2019-09-28 15:18:50 +10:00
parent d3893bc9f2
commit 8841934009
4 changed files with 69 additions and 42 deletions

View file

@ -190,6 +190,15 @@ constexpr bool ConvertToBoolUnchecked(TValue value)
return ret; return ret;
} }
// Generic sign extension
//
// Interprets the low NBITS bits of `value` as a two's-complement number and
// replicates bit (NBITS - 1) into every higher bit of T. Bits of `value` above
// NBITS are discarded.
//
// NBITS - number of significant low bits, 1 <= NBITS <= bit width of T.
// T     - integral type (signed or unsigned); the result has the same type.
template<int NBITS, typename T>
constexpr T SignExtendN(T value)
{
  static_assert(std::is_integral<T>::value, "SignExtendN requires an integral type");
  static_assert(NBITS > 0 && NBITS <= static_cast<int>(8 * sizeof(T)), "NBITS must be within the bit width of T");

  using UnsignedT = std::make_unsigned_t<T>;
  using SignedT = std::make_signed_t<T>;

  // http://graphics.stanford.edu/~seander/bithacks.html#VariableSignExtend
  constexpr int shift = static_cast<int>(8 * sizeof(T)) - NBITS;

  // Move the sign bit to the top using unsigned arithmetic: left-shifting a
  // negative or overflowing signed value is undefined behaviour before C++20.
  // The cast back to UnsignedT drops any bits that integer promotion kept above
  // the width of T, so types narrower than int are extended correctly as well.
  const UnsignedT shifted_up = static_cast<UnsignedT>(static_cast<UnsignedT>(value) << shift);

  // The arithmetic right shift on the signed reinterpretation replicates the sign bit.
  return static_cast<T>(static_cast<SignedT>(shifted_up) >> shift);
}
// Enum class bitwise operators // Enum class bitwise operators
#define IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(type_) \ #define IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(type_) \
inline constexpr type_ operator&(type_ lhs, type_ rhs) \ inline constexpr type_ operator&(type_ lhs, type_ rhs) \

View file

@ -405,9 +405,10 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
{ {
const u8 shift = sf ? 12 : 0; const u8 shift = sf ? 12 : 0;
#define dot3(i) \ #define dot3(i) \
CheckMACResult<i + 1>( \ SignExtendMACResult<i + 1>( \
(s64(m_regs.TR[i]) << 12) + \ (s64(m_regs.TR[i]) << 12) + \
CheckMACResult<i + 1>(CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \ SignExtendMACResult<i + 1>( \
SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \
s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \ s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
s64(s32(m_regs.RT[i][2]) * s32(V[2])))) s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
@ -451,14 +452,17 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
// MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
// MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
const s64 Sx = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX), 0); const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX);
const s64 Sy = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY), 0); const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY);
TruncateAndSetMAC<0>(Sx, 0);
TruncateAndSetMAC<1>(Sy, 0);
PushSXY(s32(Sx >> 16), s32(Sy >> 16)); PushSXY(s32(Sx >> 16), s32(Sy >> 16));
if (last) if (last)
{ {
// MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h ;Depth cueing 0..+1000h // MAC0=(((H*20000h/SZ3)+1)/2)*DQA+DQB, IR0=MAC0/1000h ;Depth cueing 0..+1000h
const s64 Sz = TruncateAndSetMAC<0>(s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB), 0); const s64 Sz = s64(result) * s64(m_regs.DQA) + s64(m_regs.DQB);
TruncateAndSetMAC<0>(Sz, 0);
TruncateAndSetIR<0>(s32(Sz >> 12), true); TruncateAndSetIR<0>(s32(Sz >> 12), true);
} }
} }
@ -517,8 +521,7 @@ void Core::Execute_AVSZ3(Instruction inst)
{ {
m_regs.FLAG.Clear(); m_regs.FLAG.Clear();
const s64 result = const s64 result = s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
TruncateAndSetMAC<0>(s64(m_regs.ZSF3) * s32(u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
TruncateAndSetMAC<0>(result, 0); TruncateAndSetMAC<0>(result, 0);
SetOTZ(s32(result >> 12)); SetOTZ(s32(result >> 12));
@ -529,8 +532,7 @@ void Core::Execute_AVSZ4(Instruction inst)
{ {
m_regs.FLAG.Clear(); m_regs.FLAG.Clear();
const s64 result = TruncateAndSetMAC<0>( const s64 result = s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3));
s64(m_regs.ZSF4) * s32(u32(m_regs.SZ0) + u32(m_regs.SZ1) + u32(m_regs.SZ2) + u32(m_regs.SZ3)), 0);
TruncateAndSetMAC<0>(result, 0); TruncateAndSetMAC<0>(result, 0);
SetOTZ(s32(result >> 12)); SetOTZ(s32(result >> 12));
@ -540,41 +542,31 @@ void Core::Execute_AVSZ4(Instruction inst)
void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
{ {
#define dot3(i) \ #define dot3(i) \
TruncateAndSetMAC<i + 1>( \ TruncateAndSetMACAndIR<i + 1>(SignExtendMACResult<i + 1>((s64(M[i][0]) * s64(Vx)) + (s64(M[i][1]) * s64(Vy))) + \
CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \ (s64(M[i][2]) * s64(Vz)), \
s64(s32(M[i][2]) * s32(Vz)), \ shift, lm)
shift)
dot3(0); dot3(0);
dot3(1); dot3(1);
dot3(2); dot3(2);
#undef dot3 #undef dot3
TruncateAndSetIR<1>(m_regs.MAC1, lm);
TruncateAndSetIR<2>(m_regs.MAC2, lm);
TruncateAndSetIR<3>(m_regs.MAC3, lm);
} }
void Core::MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) void Core::MulMatVec(const s16 M[3][3], const s32 T[3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
{ {
#define dot3(i) \ #define dot3(i) \
TruncateAndSetMAC<i + 1>( \ TruncateAndSetMACAndIR<i + 1>( \
(s64(T[i]) << 12) + \ SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>((s64(T[i]) << 12) + (s64(M[i][0]) * s64(Vx))) + \
CheckMACResult<i + 1>( \ (s64(M[i][1]) * s64(Vy))) + \
CheckMACResult<i + 1>(CheckMACResult<i + 1>(s64(s32(M[i][0]) * s32(Vx))) + s64(s32(M[i][1]) * s32(Vy))) + \ (s64(M[i][2]) * s64(Vz)), \
s64(s32(M[i][2]) * s32(Vz))), \ shift, lm)
shift)
dot3(0); dot3(0);
dot3(1); dot3(1);
dot3(2); dot3(2);
#undef dot3 #undef dot3
TruncateAndSetIR<1>(m_regs.MAC1, lm);
TruncateAndSetIR<2>(m_regs.MAC2, lm);
TruncateAndSetIR<3>(m_regs.MAC3, lm);
} }
void Core::NCCS(const s16 V[3], bool sf, bool lm) void Core::NCCS(const s16 V[3], bool sf, bool lm)

View file

@ -35,15 +35,22 @@ private:
static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15); static constexpr s32 IR123_MIN_VALUE = -(INT64_C(1) << 15);
static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1; static constexpr s32 IR123_MAX_VALUE = (INT64_C(1) << 15) - 1;
// Checks for underflow/overflow. Returns the value untouched so it can be threaded through an expression. // Checks for underflow/overflow.
template<u32 index> template<u32 index>
s64 CheckMACResult(s64 value); void CheckMACOverflow(s64 value);
// Checks for underflow/overflow, then sign-extends the value to 31 bits (MAC0) or 44 bits (MAC1-3).
template<u32 index>
s64 SignExtendMACResult(s64 value);
template<u32 index> template<u32 index>
s64 TruncateAndSetMAC(s64 value, u8 shift); void TruncateAndSetMAC(s64 value, u8 shift);
template<u32 index> template<u32 index>
s16 TruncateAndSetIR(s32 value, bool lm); void TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm);
template<u32 index>
void TruncateAndSetIR(s32 value, bool lm);
template<u32 index> template<u32 index>
u8 TruncateRGB(s32 value); u8 TruncateRGB(s32 value);

View file

@ -1,7 +1,7 @@
#include "gte.h" #include "gte.h"
template<u32 index> template<u32 index>
s64 GTE::Core::CheckMACResult(s64 value) void GTE::Core::CheckMACOverflow(s64 value)
{ {
constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE; constexpr s64 MIN_VALUE = (index == 0) ? MAC0_MIN_VALUE : MAC123_MIN_VALUE;
constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE; constexpr s64 MAX_VALUE = (index == 0) ? MAC0_MAX_VALUE : MAC123_MAX_VALUE;
@ -27,24 +27,28 @@ s64 GTE::Core::CheckMACResult(s64 value)
else if constexpr (index == 3) else if constexpr (index == 3)
m_regs.FLAG.mac3_overflow = true; m_regs.FLAG.mac3_overflow = true;
} }
return value;
} }
template<u32 index> template<u32 index>
s64 GTE::Core::TruncateAndSetMAC(s64 value, u8 shift) s64 GTE::Core::SignExtendMACResult(s64 value)
{ {
value = CheckMACResult<index>(value); CheckMACOverflow<index>(value);
return SignExtendN < index == 0 ? 31 : 44 > (value);
}
template<u32 index>
void GTE::Core::TruncateAndSetMAC(s64 value, u8 shift)
{
CheckMACOverflow<index>(value);
// shift should be done before storing to avoid losing precision // shift should be done before storing to avoid losing precision
value >>= shift; value >>= shift;
m_regs.dr32[24 + index] = Truncate32(static_cast<u64>(value)); m_regs.dr32[24 + index] = Truncate32(static_cast<u64>(value));
return value;
} }
template<u32 index> template<u32 index>
s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm) void GTE::Core::TruncateAndSetIR(s32 value, bool lm)
{ {
constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE; constexpr s32 MIN_VALUE = (index == 0) ? IR0_MIN_VALUE : IR123_MIN_VALUE;
constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE; constexpr s32 MAX_VALUE = (index == 0) ? IR0_MAX_VALUE : IR123_MAX_VALUE;
@ -76,7 +80,22 @@ s16 GTE::Core::TruncateAndSetIR(s32 value, bool lm)
// store sign-extended 16-bit value as 32-bit // store sign-extended 16-bit value as 32-bit
m_regs.dr32[8 + index] = value; m_regs.dr32[8 + index] = value;
return static_cast<s16>(value); }
template<u32 index>
void GTE::Core::TruncateAndSetMACAndIR(s64 value, u8 shift, bool lm)
{
CheckMACOverflow<index>(value);
// shift should be done before storing to avoid losing precision
value >>= shift;
// set MAC
const s32 value32 = static_cast<s32>(value);
m_regs.dr32[24 + index] = value32;
// set IR
TruncateAndSetIR<index>(value32, lm);
} }
template<u32 index> template<u32 index>