GTE: Fix RTPS/RTPT

This commit is contained in:
Connor McLaughlin 2019-10-12 14:49:26 +10:00
parent a55412c24a
commit 3f1fea0e97
3 changed files with 81 additions and 42 deletions

View file

@ -1,8 +1,22 @@
#include "gte.h" #include "gte.h"
#include "YBaseLib/Log.h" #include "YBaseLib/Log.h"
#include <algorithm> #include <algorithm>
#include <array>
Log_SetChannel(GTE); Log_SetChannel(GTE);
// TODO: Optimize, intrinsics?
// Counts how many consecutive zero bits sit at the top (bit 15 downwards) of a 16-bit value.
// Returns 0 when bit 15 is set and 16 when the value is zero.
static inline constexpr u32 CountLeadingZeros(u16 value)
{
  u32 zeros = 0;

  // Walk a single-bit probe from bit 15 down to bit 0, stopping at the first set bit.
  // The probe becomes zero after bit 0 has been tested, which bounds the loop to 16 steps.
  for (u16 probe = UINT16_C(0x8000); probe != 0 && (value & probe) == 0; probe >>= 1)
    zeros++;

  return zeros;
}
static inline constexpr u32 CountLeadingBits(u32 value) static inline constexpr u32 CountLeadingBits(u32 value)
{ {
u32 count = 0; u32 count = 0;
@ -379,10 +393,10 @@ void Core::PushSXY(s32 x, s32 y)
m_regs.FLAG.sx2_saturated = true; m_regs.FLAG.sx2_saturated = true;
x = -1024; x = -1024;
} }
else if (x > 32767) else if (x > 1023)
{ {
m_regs.FLAG.sx2_saturated = true; m_regs.FLAG.sx2_saturated = true;
x = 32767; x = 1023;
} }
if (y < -1024) if (y < -1024)
@ -390,10 +404,10 @@ void Core::PushSXY(s32 x, s32 y)
m_regs.FLAG.sy2_saturated = true; m_regs.FLAG.sy2_saturated = true;
y = -1024; y = -1024;
} }
else if (x > 32767) else if (y > 1023)
{ {
m_regs.FLAG.sy2_saturated = true; m_regs.FLAG.sy2_saturated = true;
y = 32767; y = 1023;
} }
m_regs.dr32[12] = m_regs.dr32[13]; // SXY0 <- SXY1 m_regs.dr32[12] = m_regs.dr32[13]; // SXY0 <- SXY1
@ -436,6 +450,50 @@ void Core::PushRGBFromMAC()
m_regs.RGBC[3]); m_regs.RGBC[3]);
} }
// Divides lhs by rhs using the Unsigned Newton-Raphson (UNR) approximation.
//
// Returns the quotient limited to 0x1FFFF. When rhs * 2 <= lhs (which includes rhs == 0) the
// divide-overflow flag is raised in m_regs.FLAG and 0x1FFFF is returned without dividing.
// NOTE(review): only the low 16 bits of rhs are used to pick the normalisation shift, so rhs is
// presumably expected to fit in 16 bits - confirm against the callers.
u32 Core::UNRDivide(u32 lhs, u32 rhs)
{
  // Reciprocal seed table, indexed by bits 7 and up of ((divisor & 0x7FFF) + 0x40).
  static constexpr std::array<u8, 257> unr_table = {{
    0xFF, 0xFD, 0xFB, 0xF9, 0xF7, 0xF5, 0xF3, 0xF1, 0xEF, 0xEE, 0xEC, 0xEA, 0xE8, 0xE6, 0xE4, 0xE3, //
    0xE1, 0xDF, 0xDD, 0xDC, 0xDA, 0xD8, 0xD6, 0xD5, 0xD3, 0xD1, 0xD0, 0xCE, 0xCD, 0xCB, 0xC9, 0xC8, // 00h..3Fh
    0xC6, 0xC5, 0xC3, 0xC1, 0xC0, 0xBE, 0xBD, 0xBB, 0xBA, 0xB8, 0xB7, 0xB5, 0xB4, 0xB2, 0xB1, 0xB0, //
    0xAE, 0xAD, 0xAB, 0xAA, 0xA9, 0xA7, 0xA6, 0xA4, 0xA3, 0xA2, 0xA0, 0x9F, 0x9E, 0x9C, 0x9B, 0x9A, //
    0x99, 0x97, 0x96, 0x95, 0x94, 0x92, 0x91, 0x90, 0x8F, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x87, 0x86, //
    0x85, 0x84, 0x83, 0x82, 0x81, 0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x77, 0x75, 0x74, // 40h..7Fh
    0x73, 0x72, 0x71, 0x70, 0x6F, 0x6E, 0x6D, 0x6C, 0x6B, 0x6A, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64, //
    0x63, 0x62, 0x61, 0x60, 0x5F, 0x5E, 0x5D, 0x5D, 0x5C, 0x5B, 0x5A, 0x59, 0x58, 0x57, 0x56, 0x55, //
    0x54, 0x53, 0x53, 0x52, 0x51, 0x50, 0x4F, 0x4E, 0x4D, 0x4D, 0x4C, 0x4B, 0x4A, 0x49, 0x48, 0x48, //
    0x47, 0x46, 0x45, 0x44, 0x43, 0x43, 0x42, 0x41, 0x40, 0x3F, 0x3F, 0x3E, 0x3D, 0x3C, 0x3C, 0x3B, // 80h..BFh
    0x3A, 0x39, 0x39, 0x38, 0x37, 0x36, 0x36, 0x35, 0x34, 0x33, 0x33, 0x32, 0x31, 0x31, 0x30, 0x2F, //
    0x2E, 0x2E, 0x2D, 0x2C, 0x2C, 0x2B, 0x2A, 0x2A, 0x29, 0x28, 0x28, 0x27, 0x26, 0x26, 0x25, 0x24, //
    0x24, 0x23, 0x22, 0x22, 0x21, 0x20, 0x20, 0x1F, 0x1E, 0x1E, 0x1D, 0x1D, 0x1C, 0x1B, 0x1B, 0x1A, //
    0x19, 0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x11, // C0h..FFh
    0x10, 0x0F, 0x0F, 0x0E, 0x0E, 0x0D, 0x0D, 0x0C, 0x0C, 0x0B, 0x0A, 0x0A, 0x09, 0x09, 0x08, 0x08, //
    0x07, 0x07, 0x06, 0x06, 0x05, 0x05, 0x04, 0x04, 0x03, 0x03, 0x02, 0x02, 0x01, 0x01, 0x00, 0x00, //
    0x00 // <-- one extra table entry (for "(d-7FC0h)/80h"=100h)
  }};

  // The quotient would not fit: flag the overflow and return the saturated value.
  if (lhs >= rhs * 2)
  {
    m_regs.FLAG.divide_overflow = true;
    return 0x1FFFF;
  }

  // Scale both operands by the same power of two so that bit 15 of the divisor is set.
  const u32 normalize = CountLeadingZeros(static_cast<u16>(rhs));
  const u32 numerator = lhs << normalize;
  const u32 divisor = (rhs << normalize) | 0x8000;

  // Look up the initial reciprocal estimate for the normalised divisor.
  const u32 table_index = ((divisor & 0x7FFF) + 0x40) >> 7;
  const s32 seed = static_cast<s32>(0x101 + ZeroExtend32(unr_table[table_index]));

  // Refine the estimate. The intermediate term is negative, so this depends on the signed
  // right shift behaving arithmetically.
  const s32 correction = ((static_cast<s32>(ZeroExtend32(divisor)) * -seed) + 0x80) >> 8;
  const u32 reciprocal = static_cast<u32>(((seed * (0x20000 + correction)) + 0x80) >> 8);

  // Multiply by the reciprocal in 64 bits, round, and drop the 16 fractional bits.
  const u32 quotient = Truncate32((ZeroExtend64(numerator) * ZeroExtend64(reciprocal) + u64(0x8000)) >> 16);

  // The min(1FFFFh) limit is needed for cases like FE3Fh/7F20h, F015h/780Bh, etc. (these do produce UNR result 20000h,
  // and are saturated to 1FFFFh, but without setting overflow FLAG bits).
  return std::min<u32>(0x1FFFF, quotient);
}
void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm) void Core::MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm)
{ {
#define dot3(i) \ #define dot3(i) \
@ -581,16 +639,13 @@ void Core::Execute_OP(Instruction inst)
m_regs.FLAG.UpdateError(); m_regs.FLAG.UpdateError();
} }
void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last) void Core::RTPS(const s16 V[3], u8 shift, bool lm, bool last)
{ {
const u8 shift = sf ? 12 : 0;
#define dot3(i) \ #define dot3(i) \
SignExtendMACResult<i + 1>( \ SignExtendMACResult<i + 1>( \
(s64(m_regs.TR[i]) << 12) + \ SignExtendMACResult<i + 1>((s64(m_regs.TR[i]) << 12) + (s64(m_regs.RT[i][0]) * s64(V[0]))) + \
SignExtendMACResult<i + 1>( \ (s64(m_regs.RT[i][1]) * s64(V[1]))) + \
SignExtendMACResult<i + 1>(SignExtendMACResult<i + 1>(s64(s32(m_regs.RT[i][0]) * s32(V[0]))) + \ (s64(m_regs.RT[i][2]) * s64(V[2]))
s64(s32(m_regs.RT[i][1]) * s32(V[1]))) + \
s64(s32(m_regs.RT[i][2]) * s32(V[2]))))
// IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12) // IR1 = MAC1 = (TRX*1000h + RT11*VX0 + RT12*VY0 + RT13*VZ0) SAR (sf*12)
// IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12) // IR2 = MAC2 = (TRY*1000h + RT21*VX0 + RT22*VY0 + RT23*VZ0) SAR (sf*12)
@ -607,35 +662,20 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
// The command does saturate IR1,IR2,IR3 to -8000h..+7FFFh (regardless of lm bit). When using RTP with sf=0, then the // The command does saturate IR1,IR2,IR3 to -8000h..+7FFFh (regardless of lm bit). When using RTP with sf=0, then the
// IR3 saturation flag (FLAG.22) gets set <only> if "MAC3 SAR 12" exceeds -8000h..+7FFFh (although IR3 is saturated // IR3 saturation flag (FLAG.22) gets set <only> if "MAC3 SAR 12" exceeds -8000h..+7FFFh (although IR3 is saturated
// when "MAC3" exceeds -8000h..+7FFFh). // when "MAC3" exceeds -8000h..+7FFFh).
TruncateAndSetIR<3>(m_regs.MAC3, false); TruncateAndSetIR<3>(z >> 12, false);
m_regs.dr32[11] = std::clamp(m_regs.MAC3, lm ? 0 : IR123_MIN_VALUE, IR123_MAX_VALUE); m_regs.dr32[11] = std::clamp(m_regs.MAC3, lm ? 0 : IR123_MIN_VALUE, IR123_MAX_VALUE);
#undef dot3 #undef dot3
// SZ3 = MAC3 SAR ((1-sf)*12) ;ScreenZ FIFO 0..+FFFFh // SZ3 = MAC3 SAR ((1-sf)*12) ;ScreenZ FIFO 0..+FFFFh
PushSZ(s32(z >> 12)); PushSZ(s32(z >> 12));
s32 result;
if (m_regs.SZ3 == 0)
{
// divide by zero
result = 0x1FFFF;
}
else
{
result = s32(((s64(ZeroExtend64(m_regs.H) * 0x20000) / s64(ZeroExtend64(m_regs.SZ3))) + 1) / 2);
if (result > 0x1FFFF)
{
m_regs.FLAG.divide_overflow = true;
result = 0x1FFFF;
}
}
// MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh // MAC0=(((H*20000h/SZ3)+1)/2)*IR1+OFX, SX2=MAC0/10000h ;ScrX FIFO -400h..+3FFh
// MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh // MAC0=(((H*20000h/SZ3)+1)/2)*IR2+OFY, SY2=MAC0/10000h ;ScrY FIFO -400h..+3FFh
const s64 result = static_cast<s64>(ZeroExtend64(UNRDivide(m_regs.H, m_regs.SZ3)));
const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX); const s64 Sx = s64(result) * s64(m_regs.IR1) + s64(m_regs.OFX);
const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY); const s64 Sy = s64(result) * s64(m_regs.IR2) + s64(m_regs.OFY);
TruncateAndSetMAC<0>(Sx, 0); CheckMACOverflow<0>(Sx);
TruncateAndSetMAC<1>(Sy, 0); CheckMACOverflow<0>(Sy);
PushSXY(s32(Sx >> 16), s32(Sy >> 16)); PushSXY(s32(Sx >> 16), s32(Sy >> 16));
if (last) if (last)
@ -650,7 +690,7 @@ void Core::RTPS(const s16 V[3], bool sf, bool lm, bool last)
void Core::Execute_RTPS(Instruction inst) void Core::Execute_RTPS(Instruction inst)
{ {
m_regs.FLAG.Clear(); m_regs.FLAG.Clear();
RTPS(m_regs.V0, inst.sf, inst.lm, true); RTPS(m_regs.V0, inst.GetShift(), inst.lm, true);
m_regs.FLAG.UpdateError(); m_regs.FLAG.UpdateError();
} }
@ -658,10 +698,12 @@ void Core::Execute_RTPT(Instruction inst)
{ {
m_regs.FLAG.Clear(); m_regs.FLAG.Clear();
const bool sf = inst.sf; const u8 shift = inst.GetShift();
RTPS(m_regs.V0, sf, inst.lm, false); const bool lm = inst.lm;
RTPS(m_regs.V1, sf, inst.lm, false);
RTPS(m_regs.V2, sf, inst.lm, true); RTPS(m_regs.V0, shift, lm, false);
RTPS(m_regs.V1, shift, lm, false);
RTPS(m_regs.V2, shift, lm, true);
m_regs.FLAG.UpdateError(); m_regs.FLAG.UpdateError();
} }

View file

@ -61,6 +61,9 @@ private:
void PushRGB(u8 r, u8 g, u8 b, u8 c); void PushRGB(u8 r, u8 g, u8 b, u8 c);
void PushRGBFromMAC(); void PushRGBFromMAC();
// Divide using Unsigned Newton-Raphson algorithm.
u32 UNRDivide(u32 lhs, u32 rhs);
// 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3] // 3x3 matrix * 3x1 vector, updates MAC[1-3] and IR[1-3]
void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm); void MulMatVec(const s16 M[3][3], const s16 Vx, const s16 Vy, const s16 Vz, u8 shift, bool lm);
@ -70,7 +73,7 @@ private:
// Interpolate colour, or as in nocash "MAC+(FC-MAC)*IR0". // Interpolate colour, or as in nocash "MAC+(FC-MAC)*IR0".
void InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm); void InterpolateColor(s64 in_MAC1, s64 in_MAC2, s64 in_MAC3, u8 shift, bool lm);
void RTPS(const s16 V[3], bool sf, bool lm, bool last); void RTPS(const s16 V[3], u8 shift, bool lm, bool last);
void NCS(const s16 V[3], u8 shift, bool lm); void NCS(const s16 V[3], u8 shift, bool lm);
void NCCS(const s16 V[3], u8 shift, bool lm); void NCCS(const s16 V[3], u8 shift, bool lm);
void NCDS(const s16 V[3], u8 shift, bool lm); void NCDS(const s16 V[3], u8 shift, bool lm);

View file

@ -34,12 +34,6 @@ union FLAGS
static constexpr u32 WRITE_MASK = UINT32_C(0xFFFFF000); static constexpr u32 WRITE_MASK = UINT32_C(0xFFFFF000);
void SetMACOverflow(u32 index) { bits |= (index == 0) ? (UINT32_C(1) << 16) : (UINT32_C(1) << (31 - index)); }
void SetMACUnderflow(u32 index) { bits |= (index == 0) ? (UINT32_C(1) << 15) : (UINT32_C(1) << (27 - index)); }
void SetIRSaturated(u32 index) { bits |= (index == 0) ? (UINT32_C(1) << 12) : (UINT32_C(1) << (25 - index)); }
void Clear() { bits = 0; } void Clear() { bits = 0; }
// Bits 30..23, 18..13 OR'ed // Bits 30..23, 18..13 OR'ed