GPU/HW: Implement oversized copies on GPU

Fixes the slowdown caused by oversized VRAM copies, which previously went through a CPU round trip.
This commit is contained in:
Connor McLaughlin 2020-04-19 22:30:54 +10:00
parent 045c4d1745
commit 5ad133a278
8 changed files with 129 additions and 16 deletions

View file

@ -433,7 +433,7 @@ void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
*bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
}
Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height)
Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const
{
Common::Rectangle<u32> out_rc = Common::Rectangle<u32>::FromExtents(x, y, width, height);
if (out_rc.right > VRAM_WIDTH)
@ -449,6 +449,15 @@ Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u3
return out_rc;
}
/// Decides whether a VRAM-to-VRAM copy must be performed with the copy shader.
/// Returns true when masking is enabled, when either rectangle extends past the
/// VRAM edge (oversized), or when the source and destination rectangles overlap.
bool GPU_HW::UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const
{
  // masking enabled
  if (m_GPUSTAT.IsMaskingEnabled())
    return true;

  // source rectangle runs off the right/bottom edge of VRAM
  if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT)
    return true;

  // destination rectangle runs off the right/bottom edge of VRAM
  if ((dst_x + width) > VRAM_WIDTH || (dst_y + height) > VRAM_HEIGHT)
    return true;

  // in-bounds copy: only the overlapping case still needs the shader
  const Common::Rectangle<u32> src_rect = Common::Rectangle<u32>::FromExtents(src_x, src_y, width, height);
  const Common::Rectangle<u32> dst_rect = Common::Rectangle<u32>::FromExtents(dst_x, dst_y, width, height);
  return src_rect.Intersects(dst_rect);
}
GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
{
if (rc.primitive == Primitive::Line)

View file

@ -106,6 +106,17 @@ protected:
u32 u_interlaced_displayed_field;
};
/// Uniform data uploaded for the VRAM copy shader.
/// The field order mirrors the uniform block declared by GenerateVRAMCopyFragmentShader()
/// (uint2 u_src_coords, uint2 u_dst_coords, uint2 u_size, bool u_set_mask_bit), so each
/// x/y pair here lines up with one uint2 in the shader.
/// The CopyVRAM() implementations fill the coordinates and size with the VRAM values
/// multiplied by m_resolution_scale; the OpenGL path additionally flips both Y values
/// relative to the VRAM texture height.
struct VRAMCopyUBOData
{
u32 u_src_x; // left edge of the source rectangle
u32 u_src_y; // top edge of the source rectangle
u32 u_dst_x; // left edge of the destination rectangle
u32 u_dst_y; // top edge of the destination rectangle
u32 u_width; // width of the copied rectangle
u32 u_height; // height of the copied rectangle
u32 u_set_mask_bit; // 1 when m_GPUSTAT.set_mask_while_drawing is set, else 0; the shader then writes alpha as 1.0
};
struct RendererStats
{
u32 num_batches;
@ -157,7 +168,10 @@ protected:
}
/// Computes the area affected by a VRAM transfer, including wrap-around of X.
Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height);
Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const;
/// Returns true if the VRAM copy shader should be used (masking enabled, oversized copies, or overlapping source/destination).
bool UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const;
/// Handles quads with flipped texture coordinate directions.
static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);

View file

@ -404,6 +404,10 @@ bool GPU_HW_D3D11::CompileShaders()
if (!m_vram_write_pixel_shader)
return false;
m_vram_copy_pixel_shader = m_shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateVRAMCopyFragmentShader());
if (!m_vram_copy_pixel_shader)
return false;
for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
{
for (u8 interlacing = 0; interlacing < 2; interlacing++)
@ -689,14 +693,30 @@ void GPU_HW_D3D11::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* d
void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
{
if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
(dst_y + height) > VRAM_HEIGHT)
if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
{
Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
height);
ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
if (m_vram_dirty_rect.Intersects(src_bounds))
UpdateVRAMReadTexture();
IncludeVRAMDityRectangle(dst_bounds);
const VRAMCopyUBOData uniforms = {
src_x * m_resolution_scale,
src_y * m_resolution_scale,
dst_x * m_resolution_scale,
dst_y * m_resolution_scale,
width * m_resolution_scale,
height * m_resolution_scale,
m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
};
const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
SetViewportAndScissor(dst_bounds_scaled.left, dst_bounds_scaled.top, dst_bounds_scaled.GetWidth(),
dst_bounds_scaled.GetHeight());
m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray());
DrawUtilityShader(m_vram_copy_pixel_shader.Get(), &uniforms, sizeof(uniforms));
RestoreGraphicsAPIState();
return;
}

View file

@ -113,5 +113,6 @@ private:
ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader;
ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader;
ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader;
ComPtr<ID3D11PixelShader> m_vram_copy_pixel_shader;
std::array<std::array<ComPtr<ID3D11PixelShader>, 2>, 2> m_display_pixel_shaders; // [depth_24][interlaced]
};

View file

@ -431,6 +431,19 @@ bool GPU_HW_OpenGL::CompilePrograms()
prog->Uniform1i("samp0", 0);
m_vram_read_program = std::move(*prog);
prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
shadergen.GenerateVRAMCopyFragmentShader(), [this](GL::Program& prog) {
if (!m_is_gles)
prog.BindFragData(0, "o_col0");
});
if (!prog)
return false;
prog->BindUniformBlock("UBOBlock", 1);
prog->Bind();
prog->Uniform1i("samp0", 0);
m_vram_copy_program = std::move(*prog);
if (m_supports_texture_buffer)
{
prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
@ -770,14 +783,39 @@ void GPU_HW_OpenGL::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void*
void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
{
if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
(dst_y + height) > VRAM_HEIGHT)
if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
{
Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
height);
ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
if (m_vram_dirty_rect.Intersects(src_bounds))
UpdateVRAMReadTexture();
IncludeVRAMDityRectangle(dst_bounds);
VRAMCopyUBOData uniforms = {
src_x * m_resolution_scale,
src_y * m_resolution_scale,
dst_x * m_resolution_scale,
dst_y * m_resolution_scale,
width * m_resolution_scale,
height * m_resolution_scale,
m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
};
uniforms.u_src_y = m_vram_texture.GetHeight() - uniforms.u_src_y - uniforms.u_height;
uniforms.u_dst_y = m_vram_texture.GetHeight() - uniforms.u_dst_y - uniforms.u_height;
UploadUniformBuffer(&uniforms, sizeof(uniforms));
glDisable(GL_SCISSOR_TEST);
glDisable(GL_BLEND);
const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
glViewport(dst_bounds_scaled.left,
m_vram_texture.GetHeight() - dst_bounds_scaled.top - dst_bounds_scaled.GetHeight(),
dst_bounds_scaled.GetWidth(), dst_bounds_scaled.GetHeight());
m_vram_read_texture.Bind();
m_vram_copy_program.Bind();
glDrawArrays(GL_TRIANGLES, 0, 3);
RestoreGraphicsAPIState();
return;
}

View file

@ -84,6 +84,7 @@ private:
GL::Program m_vram_interlaced_fill_program;
GL::Program m_vram_read_program;
GL::Program m_vram_write_program;
GL::Program m_vram_copy_program;
u32 m_uniform_buffer_alignment = 1;
u32 m_max_texture_buffer_size = 0;

View file

@ -967,3 +967,32 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader()
return ss.str();
}
/// Generates the fragment shader used for VRAM-to-VRAM copies.
/// For every destination pixel the shader works out how far it lies from the
/// destination origin (accounting for wrap-around past the VRAM edge), reads the
/// source texel at the same distance from the source origin (modulo the VRAM size),
/// and writes it out, forcing alpha to 1.0 when u_set_mask_bit is set.
/// Returns the complete shader source as a string.
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);

  // Must stay in sync with the layout of VRAMCopyUBOData on the C++ side.
  DeclareUniformBuffer(ss, {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_size", "bool u_set_mask_bit"});
  DeclareTexture(ss, "samp0", 0);
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false);

  // NOTE(review): u_size is declared but never read by the shader. If the viewport for a
  // wrapping copy covers pixels outside the destination rectangle, nothing here rejects
  // them - confirm against GetVRAMTransferBounds() and the CopyVRAM() callers.
  ss << R"(
{
  uint2 dst_coords = uint2(v_pos.xy);

  // find offset from the start of the row/column
  // A pixel before the destination origin belongs to the wrapped-around part of the copy:
  // its distance from the origin is the run up to the VRAM edge (VRAM_SIZE - u_dst_coords)
  // plus its own position. Using VRAM_SIZE - 1 here would repeat the last pre-wrap texel.
  uint2 offset;
  offset.x = (dst_coords.x < u_dst_coords.x) ? (uint(VRAM_SIZE.x) - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
  offset.y = (dst_coords.y < u_dst_coords.y) ? (uint(VRAM_SIZE.y) - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);

  // find the source coordinates to copy from
  uint2 src_coords = (u_src_coords + offset) % uint2(VRAM_SIZE);

  // sample and apply mask bit
  float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
  o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
})";

  return ss.str();
}

View file

@ -22,6 +22,7 @@ public:
std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
std::string GenerateVRAMReadFragmentShader();
std::string GenerateVRAMWriteFragmentShader();
std::string GenerateVRAMCopyFragmentShader();
HostDisplay::RenderAPI m_render_api;
u32 m_resolution_scale;