From 8ab46d071308de19be0cebb5b5c140c000959c19 Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <stenzek@gmail.com>
Date: Wed, 14 Dec 2022 17:28:34 +1000
Subject: [PATCH] MDEC: Redo IDCT and conversion

RE2 backgrounds look better now (mainly due to the 24bpp->16bpp
conversion). Also implements signed output (untested).
---
 src/core/mdec.cpp | 59 ++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp
index 3e63f77bb..ffb1092f3 100644
--- a/src/core/mdec.cpp
+++ b/src/core/mdec.cpp
@@ -5,6 +5,7 @@
 #include "common/log.h"
 #include "cpu_core.h"
 #include "dma.h"
+#include "gpu_types.h"
 #include "host.h"
 #include "imgui.h"
 #include "interrupt_controller.h"
@@ -517,22 +518,22 @@ void MDEC::CopyOutBlock()
 
     case DataOutputDepth_15Bit:
     {
-      const u16 a = ZeroExtend16(m_status.data_output_bit15.GetValue());
+      const u32 a = ZeroExtend32(m_status.data_output_bit15.GetValue()) << 15;
       for (u32 i = 0; i < static_cast<u32>(m_block_rgb.size());)
       {
         u32 color = m_block_rgb[i++];
-        u16 r = Truncate16((color >> 3) & 0x1Fu);
-        u16 g = Truncate16((color >> 11) & 0x1Fu);
-        u16 b = Truncate16((color >> 19) & 0x1Fu);
-        const u16 color15a = r | (g << 5) | (b << 10) | (a << 15);
+        u32 r = VRAMConvert8To5(color & 0xFFu);
+        u32 g = VRAMConvert8To5((color >> 8) & 0xFFu);
+        u32 b = VRAMConvert8To5((color >> 16) & 0xFFu);
+        const u32 color15a = r | (g << 5) | (b << 10) | a;
 
         color = m_block_rgb[i++];
-        r = Truncate16((color >> 3) & 0x1Fu);
-        g = Truncate16((color >> 11) & 0x1Fu);
-        b = Truncate16((color >> 19) & 0x1Fu);
-        const u16 color15b = r | (g << 5) | (b << 10) | (a << 15);
+        r = VRAMConvert8To5(color & 0xFFu);
+        g = VRAMConvert8To5((color >> 8) & 0xFFu);
+        b = VRAMConvert8To5((color >> 16) & 0xFFu);
+        const u32 color15b = r | (g << 5) | (b << 10) | a;
 
-        m_data_out_fifo.Push(ZeroExtend32(color15a) | (ZeroExtend32(color15b) << 16));
+        m_data_out_fifo.Push(color15a | (color15b << 16));
       }
     }
     break;
@@ -587,7 +588,7 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)
     val = std::clamp(val, -0x400, 0x3FF);
     if (m_current_q_scale > 0)
       blk[zagzig[m_current_coefficient]] = static_cast<s16>(val);
-    else if (m_current_q_scale == 0)
+    else
       blk[m_current_coefficient] = static_cast<s16>(val);
   }
 
@@ -610,7 +611,7 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)
       val = std::clamp(val, -0x400, 0x3FF);
       if (m_current_q_scale > 0)
         blk[zagzig[m_current_coefficient]] = static_cast<s16>(val);
-      else if (m_current_q_scale == 0)
+      else
         blk[m_current_coefficient] = static_cast<s16>(val);
     }
 
@@ -626,27 +627,27 @@ bool MDEC::rl_decode_block(s16* blk, const u8* qt)
 
 void MDEC::IDCT(s16* blk)
 {
-  std::array<s64, 64> temp_buffer;
+  std::array<s32, 64> temp;
   for (u32 x = 0; x < 8; x++)
   {
     for (u32 y = 0; y < 8; y++)
     {
-      s64 sum = 0;
-      for (u32 u = 0; u < 8; u++)
-        sum += s32(blk[u * 8 + x]) * s32(m_scale_table[u * 8 + y]);
-      temp_buffer[x + y * 8] = sum;
+      // TODO: We could alter zigzag and invert scale_table to get these in row-major order,
+      // in which case we could do optimize this to a vector multiply.
+      s32 sum = 0;
+      for (u32 z = 0; z < 8; z++)
+        sum += s32(blk[y + z * 8]) * s32(m_scale_table[x + z * 8] / 8);
+      temp[x + y * 8] = static_cast<s32>((sum + 0xfff) / 0x2000);
     }
   }
   for (u32 x = 0; x < 8; x++)
   {
     for (u32 y = 0; y < 8; y++)
     {
-      s64 sum = 0;
-      for (u32 u = 0; u < 8; u++)
-        sum += s64(temp_buffer[u + y * 8]) * s32(m_scale_table[u * 8 + x]);
-
-      blk[x + y * 8] =
-        static_cast<s16>(std::clamp<s32>(SignExtendN<9, s32>((sum >> 32) + ((sum >> 31) & 1)), -128, 127));
+      s32 sum = 0;
+      for (u32 z = 0; z < 8; z++)
+        sum += temp[y + z * 8] * s32(m_scale_table[x + z * 8] / 8);
+      blk[x + y * 8] = static_cast<s16>(std::clamp<s32>((sum + 0xfff) / 0x2000, -128, 127));
     }
   }
 }
@@ -654,6 +655,7 @@ void MDEC::IDCT(s16* blk)
 void MDEC::yuv_to_rgb(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const std::array<s16, 64>& Cbblk,
                       const std::array<s16, 64>& Yblk)
 {
+  const s16 addval = m_status.data_output_signed ? 0 : 0x80;
   for (u32 y = 0; y < 8; y++)
   {
     for (u32 x = 0; x < 8; x++)
@@ -666,14 +668,9 @@ void MDEC::yuv_to_rgb(u32 xx, u32 yy, const std::array<s16, 64>& Crblk, const st
       B = static_cast<s16>(1.772f * static_cast<float>(B));
 
       s16 Y = Yblk[x + y * 8];
-      R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127));
-      G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127));
-      B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127));
-
-      // TODO: Signed output
-      R += 128;
-      G += 128;
-      B += 128;
+      R = static_cast<s16>(std::clamp(static_cast<int>(Y) + R, -128, 127)) + addval;
+      G = static_cast<s16>(std::clamp(static_cast<int>(Y) + G, -128, 127)) + addval;
+      B = static_cast<s16>(std::clamp(static_cast<int>(Y) + B, -128, 127)) + addval;
 
       m_block_rgb[(x + xx) + ((y + yy) * 16)] = ZeroExtend32(static_cast<u16>(R)) |
                                                 (ZeroExtend32(static_cast<u16>(G)) << 8) |