GPU: Vectorize VRAM fills

This commit is contained in:
Stenzek 2024-08-12 00:20:53 +10:00
parent 5d84185240
commit 9a58327cf2
No known key found for this signature in database
2 changed files with 78 additions and 19 deletions

View file

@ -1491,12 +1491,22 @@ void GPU::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
void GPU::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
{
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
constexpr u32 vector_width = 8;
const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
if ((x + width) <= VRAM_WIDTH && !IsInterlacedRenderingEnabled())
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else if (IsInterlacedRenderingEnabled())
@ -1506,17 +1516,36 @@ void GPU::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
SynchronizeCRTC();
const u32 active_field = GetActiveLineLSB();
for (u32 yoffs = 0; yoffs < height; yoffs++)
if ((x + width) <= VRAM_WIDTH)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
}

View file

@ -715,29 +715,59 @@ void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBac
void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
{
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
constexpr u32 vector_width = 8;
const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else if (params.interlaced_rendering)
{
// Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
const u32 active_field = params.active_line_lsb;
for (u32 yoffs = 0; yoffs < height; yoffs++)
if ((x + width) <= VRAM_WIDTH)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
}