From b094c9cd879e2a5f029bf4763495d9780f5cb367 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Tue, 2 Jul 2024 19:05:02 +1000 Subject: [PATCH] MDEC: Vectorize IDCT --- src/core/mdec.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp index d65b7d993..321ddeb19 100644 --- a/src/core/mdec.cpp +++ b/src/core/mdec.cpp @@ -13,6 +13,7 @@ #include "common/bitfield.h" #include "common/fifo_queue.h" +#include "common/gsvector.h" #include "common/log.h" #include "imgui.h" @@ -141,7 +142,7 @@ static std::array s_iq_y{}; static std::array s_scale_table{}; // blocks, for colour: 0 - Crblk, 1 - Cbblk, 2-5 - Y 1-4 -static std::array, NUM_BLOCKS> s_blocks; +alignas(VECTOR_ALIGNMENT) static std::array, NUM_BLOCKS> s_blocks; static u32 s_current_block = 0; // block (0-5) static u32 s_current_coefficient = 64; // k (in block) static u16 s_current_q_scale = 0; @@ -935,24 +936,17 @@ bool MDEC::DecodeRLE_New(s16* blk, const u8* qt) return false; } -template -static s32 IDCTRow(const BlkType* blk, const s16* idct_matrix) +static s16 IDCTRow(const s16* blk, const s16* idct_matrix) { // IDCT matrix is -32768..32767, block is -16384..16383. 4 adds can happen without overflow. - const s32 sum1 = static_cast(blk[0]) * static_cast(idct_matrix[0]) + - static_cast(blk[1]) * static_cast(idct_matrix[1]) + - static_cast(blk[2]) * static_cast(idct_matrix[2]) + - static_cast(blk[3]) * static_cast(idct_matrix[3]); - const s32 sum2 = static_cast(blk[4]) * static_cast(idct_matrix[4]) + - static_cast(blk[5]) * static_cast(idct_matrix[5]) + - static_cast(blk[6]) * static_cast(idct_matrix[6]) + - static_cast(blk[7]) * static_cast(idct_matrix[7]); - return static_cast(((static_cast(sum1) + static_cast(sum2)) + 0x20000) >> 18); + GSVector4i sum = GSVector4i::load(blk).madd_s16(GSVector4i::load(idct_matrix)).addp_s32(); + return static_cast(((static_cast(sum.extract32<0>()) + static_cast(sum.extract32<1>())) + 0x20000) >> + 18); } void MDEC::IDCT_New(s16* blk) { - std::array temp; + alignas(VECTOR_ALIGNMENT) std::array temp; for (u32 x = 0; x < 8; x++) { for (u32 y = 0; y < 8; y++)