diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 2dc6f7eb9..2ca9234a1 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -1041,21 +1041,73 @@ void GPU::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data)
 
 void GPU::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
 {
+  // Break up oversized copies. This behavior has not been verified on console.
+  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
+  {
+    u32 remaining_rows = height;
+    u32 current_src_y = src_y;
+    u32 current_dst_y = dst_y;
+    while (remaining_rows > 0)
+    {
+      const u32 rows_to_copy =
+        std::min(remaining_rows, std::min(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
+
+      u32 remaining_columns = width;
+      u32 current_src_x = src_x;
+      u32 current_dst_x = dst_x;
+      while (remaining_columns > 0)
+      {
+        const u32 columns_to_copy =
+          std::min(remaining_columns, std::min(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
+        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy);
+        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
+        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
+        remaining_columns -= columns_to_copy;
+      }
+
+      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
+      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
+      remaining_rows -= rows_to_copy;
+    }
+
+    return;
+  }
+
   // This doesn't have a fast path, but do we really need one? It's not common.
   const u16 mask_and = m_GPUSTAT.GetMaskAND();
   const u16 mask_or = m_GPUSTAT.GetMaskOR();
 
-  for (u32 row = 0; row < height; row++)
+  // Copy in reverse when src_x < dst_x; this is verified on console.
+  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
   {
-    const u16* src_row_ptr = &m_vram_ptr[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-    u16* dst_row_ptr = &m_vram_ptr[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
-
-    for (u32 col = 0; col < width; col++)
+    for (u32 row = 0; row < height; row++)
     {
-      const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
-      u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
-      if ((*dst_pixel_ptr & mask_and) == 0)
-        *dst_pixel_ptr = src_pixel | mask_or;
+      const u16* src_row_ptr = &m_vram_ptr[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &m_vram_ptr[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        if ((*dst_pixel_ptr & mask_and) == 0)
+          *dst_pixel_ptr = src_pixel | mask_or;
+      }
+    }
+  }
+  else
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &m_vram_ptr[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &m_vram_ptr[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (u32 col = 0; col < width; col++)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
+        if ((*dst_pixel_ptr & mask_and) == 0)
+          *dst_pixel_ptr = src_pixel | mask_or;
+      }
     }
   }
 }
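
Note for reviewers: a minimal standalone sketch (not part of the patch) of what the new splitting path does for a copy that runs past the VRAM edge. Each chunk is clamped to the distance to the nearer edge of source or destination, then the coordinates wrap and the loop continues; the recursive CopyVRAM call then only ever sees non-wrapping rectangles. The constant mirrors the 1024-pixel VRAM width; the src_x/dst_x/width values are arbitrary examples.

#include <algorithm>
#include <cstdio>

int main()
{
  constexpr unsigned VRAM_WIDTH = 1024;

  // A copy of 200 columns starting at src_x=1000 wraps past the right edge.
  unsigned src_x = 1000, dst_x = 512, width = 200;

  unsigned remaining = width, cur_src = src_x, cur_dst = dst_x;
  while (remaining > 0)
  {
    // Clamp the chunk to whichever of source/destination hits the edge first.
    const unsigned chunk = std::min(remaining, std::min(VRAM_WIDTH - cur_src, VRAM_WIDTH - cur_dst));
    std::printf("copy %u columns: src_x=%u -> dst_x=%u\n", chunk, cur_src, cur_dst);
    cur_src = (cur_src + chunk) % VRAM_WIDTH;
    cur_dst = (cur_dst + chunk) % VRAM_WIDTH;
    remaining -= chunk;
  }
  // Prints: copy 24 columns: src_x=1000 -> dst_x=512
  //         copy 176 columns: src_x=0 -> dst_x=536
}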
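
And a second sketch (plain ints, no DuckStation types, example values are mine) of why the patch walks columns in reverse when the destination starts ahead of the source. A forward copy overwrites source pixels before they are read and smears the first pixels across the overlap; the reverse walk reads each source pixel before it can be clobbered, giving the memmove-style result the patch comment describes as console-verified.

#include <cstdio>

int main()
{
  // One "row" of eight pixels; copy 4 pixels from x=0 to x=2 (overlapping).
  int forward[8] = {1, 2, 3, 4, 0, 0, 0, 0};
  int reverse[8] = {1, 2, 3, 4, 0, 0, 0, 0};

  // Forward copy clobbers source pixels before they are read: dst[4]=src[2]
  // happens after src[2] was already overwritten, so {1, 2} smears across
  // the destination instead of {1, 2, 3, 4} being copied.
  for (int col = 0; col < 4; col++)
    forward[2 + col] = forward[0 + col];

  // Reverse copy reads each source pixel before it can be overwritten,
  // matching what the patch does when src_x < dst_x.
  for (int col = 3; col >= 0; col--)
    reverse[2 + col] = reverse[0 + col];

  for (int i = 0; i < 8; i++)
    std::printf("%d%c", forward[i], i == 7 ? '\n' : ' '); // 1 2 1 2 1 2 0 0
  for (int i = 0; i < 8; i++)
    std::printf("%d%c", reverse[i], i == 7 ? '\n' : ' '); // 1 2 1 2 3 4 0 0
}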