From 2a6e04988f4c6ef97f541d84596fd120dd4d101e Mon Sep 17 00:00:00 2001 From: Connor McLaughlin <stenzek@gmail.com> Date: Thu, 2 Apr 2020 00:17:25 +1000 Subject: [PATCH] GPU/HW: Simplify 24-bit scanout and interlacing shader Has the added bonus of being faster (no downscale copy) and fixes the edge case where the image is cropped in VRAM. --- src/core/gpu.cpp | 4 -- src/core/gpu_hw_d3d11.cpp | 45 ++++++------------- src/core/gpu_hw_opengl.cpp | 69 ++++++++--------------------- src/core/gpu_hw_opengl_es.cpp | 83 ++++++++++++----------------------- src/core/gpu_hw_shadergen.cpp | 59 +++++++------------------ 5 files changed, 75 insertions(+), 185 deletions(-) diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp index bb0567864..fba1d8d11 100644 --- a/src/core/gpu.cpp +++ b/src/core/gpu.cpp @@ -440,10 +440,6 @@ void GPU::UpdateCRTCDisplayParameters() cs.display_vram_left = std::min<u16>( m_crtc_state.regs.X + ((horizontal_display_start_tick - cs.horizontal_display_start) / cs.dot_clock_divider), VRAM_WIDTH - 1); - - // for 24-bit scanout we must stay aligned - if (m_GPUSTAT.display_area_color_depth_24 && ((cs.display_vram_left - cs.regs.X) & 1u)) - cs.display_vram_left--; } if (cs.horizontal_display_end <= horizontal_display_end_tick) diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp index 10250b646..da9a9c2ef 100644 --- a/src/core/gpu_hw_d3d11.cpp +++ b/src/core/gpu_hw_d3d11.cpp @@ -552,43 +552,24 @@ void GPU_HW_D3D11::UpdateDisplay() } else { - const u32 field_offset = BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field); + m_context->OMSetRenderTargets(1, m_display_texture.GetD3DRTVArray(), nullptr); + m_context->PSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); + const u32 reinterpret_field_offset = + (m_crtc_state.regs.Y + BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field)) & 1u; + const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale; + const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X); + const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x}; ID3D11PixelShader* display_pixel_shader = m_display_pixel_shaders[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)].Get(); - // Because of how the reinterpret shader works, we need to use the downscaled version. - if (m_GPUSTAT.display_area_color_depth_24 && m_resolution_scale > 1) - { - const u32 copy_width = std::min<u32>((display_width * 3) / 2, VRAM_WIDTH - vram_offset_x); - const u32 scaled_copy_width = copy_width * m_resolution_scale; - BlitTexture(m_vram_encoding_texture.GetD3DRTV(), vram_offset_x, vram_offset_y, copy_width, display_height, - m_vram_texture.GetD3DSRV(), scaled_vram_offset_x, scaled_vram_offset_y, scaled_copy_width, - scaled_display_height, m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), false); + SetViewportAndScissor(reinterpret_start_x, m_crtc_state.display_vram_top, reinterpret_width, + scaled_display_height); + DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms)); - m_context->OMSetRenderTargets(1, m_display_texture.GetD3DRTVArray(), nullptr); - m_context->PSSetShaderResources(0, 1, m_vram_encoding_texture.GetD3DSRVArray()); - - const u32 uniforms[4] = {vram_offset_x, vram_offset_y, field_offset}; - SetViewportAndScissor(0, field_offset, display_width, display_height); - DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms)); - - m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(), - m_display_texture.GetHeight(), 0, 0, display_width, display_height); - } - else - { - m_context->OMSetRenderTargets(1, m_display_texture.GetD3DRTVArray(), nullptr); - m_context->PSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); - - const u32 uniforms[4] = {scaled_vram_offset_x, scaled_vram_offset_y, field_offset}; - SetViewportAndScissor(0, field_offset, scaled_display_width, scaled_display_height); - DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms)); - - m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(), - m_display_texture.GetHeight(), 0, 0, scaled_display_width, - scaled_display_height); - } + m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(), + m_display_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y, + scaled_display_width, scaled_display_height); RestoreGraphicsAPIState(); } diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp index 534c16a08..159e88087 100644 --- a/src/core/gpu_hw_opengl.cpp +++ b/src/core/gpu_hw_opengl.cpp @@ -507,63 +507,32 @@ void GPU_HW_OpenGL::UpdateDisplay() } else { - const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height; - const u32 scaled_flipped_vram_offset_y = - m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height; - const u32 field_offset = BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field); - glDisable(GL_BLEND); glDisable(GL_SCISSOR_TEST); - const GL::Program& prog = - m_display_programs[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)]; - prog.Bind(); + m_display_programs[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)].Bind(); + m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + m_vram_texture.Bind(); - // Because of how the reinterpret shader works, we need to use the downscaled version. - if (m_GPUSTAT.display_area_color_depth_24 && m_resolution_scale > 1) - { - const u32 copy_width = std::min<u32>((display_width * 3) / 2, VRAM_WIDTH - vram_offset_x); - const u32 scaled_copy_width = copy_width * m_resolution_scale; - m_vram_encoding_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_texture.BindFramebuffer(GL_READ_FRAMEBUFFER); - glBlitFramebuffer(scaled_vram_offset_x, scaled_flipped_vram_offset_y, scaled_vram_offset_x + scaled_copy_width, - scaled_flipped_vram_offset_y + scaled_display_height, vram_offset_x, flipped_vram_offset_y, - vram_offset_x + copy_width, flipped_vram_offset_y + display_height, GL_COLOR_BUFFER_BIT, - GL_NEAREST); + const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height; + const u32 scaled_flipped_vram_offset_y = + m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height; - m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_encoding_texture.Bind(); + const u32 reinterpret_field_offset = + (m_crtc_state.regs.Y + BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field)) & 1u; + const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale; + const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X); + const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x}; + UploadUniformBlock(uniforms, sizeof(uniforms)); + m_batch_ubo_dirty = true; - glViewport(0, field_offset, display_width, display_height); + glViewport(reinterpret_start_x, scaled_flipped_vram_offset_y, reinterpret_width, scaled_display_height); + glDrawArrays(GL_TRIANGLES, 0, 3); - const u32 uniforms[4] = {vram_offset_x, flipped_vram_offset_y, field_offset}; - UploadUniformBlock(uniforms, sizeof(uniforms)); - m_batch_ubo_dirty = true; - - glDrawArrays(GL_TRIANGLES, 0, 3); - - m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), - m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0, - display_height, display_width, -static_cast<s32>(display_height)); - } - else - { - m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_texture.Bind(); - - glViewport(0, field_offset, scaled_display_width, scaled_display_height); - - const u32 uniforms[4] = {scaled_vram_offset_x, scaled_flipped_vram_offset_y, field_offset}; - UploadUniformBlock(uniforms, sizeof(uniforms)); - m_batch_ubo_dirty = true; - - glDrawArrays(GL_TRIANGLES, 0, 3); - - m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), - m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0, - scaled_display_height, scaled_display_width, - -static_cast<s32>(scaled_display_height)); - } + m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), + m_display_texture.GetWidth(), m_display_texture.GetHeight(), + scaled_vram_offset_x, m_vram_texture.GetHeight() - scaled_vram_offset_y, + scaled_display_width, -static_cast<s32>(scaled_display_height)); // restore state m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); diff --git a/src/core/gpu_hw_opengl_es.cpp b/src/core/gpu_hw_opengl_es.cpp index 900f2ab88..c1a7c6ded 100644 --- a/src/core/gpu_hw_opengl_es.cpp +++ b/src/core/gpu_hw_opengl_es.cpp @@ -371,75 +371,46 @@ void GPU_HW_OpenGL_ES::UpdateDisplay() } else { - const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height; - const u32 scaled_flipped_vram_offset_y = - m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height; - const u32 field_offset = BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field); - glDisable(GL_BLEND); glDisable(GL_SCISSOR_TEST); - const GL::Program& prog = + GL::Program& prog = m_display_programs[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)]; prog.Bind(); + m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + m_vram_texture.Bind(); - // Because of how the reinterpret shader works, we need to use the downscaled version. - if (m_GPUSTAT.display_area_color_depth_24 && m_resolution_scale > 1) - { - const u32 copy_width = std::min<u32>((display_width * 3) / 2, VRAM_WIDTH - vram_offset_x); - const u32 scaled_copy_width = copy_width * m_resolution_scale; - m_vram_encoding_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_texture.BindFramebuffer(GL_READ_FRAMEBUFFER); - glBlitFramebuffer(scaled_vram_offset_x, scaled_flipped_vram_offset_y, scaled_vram_offset_x + scaled_copy_width, - scaled_flipped_vram_offset_y + scaled_display_height, vram_offset_x, flipped_vram_offset_y, - vram_offset_x + copy_width, flipped_vram_offset_y + display_height, GL_COLOR_BUFFER_BIT, - GL_NEAREST); + const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height; + const u32 scaled_flipped_vram_offset_y = + m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height; - m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_encoding_texture.Bind(); + const u32 reinterpret_field_offset = + (m_crtc_state.regs.Y + BoolToUInt8(interlaced && m_GPUSTAT.interlaced_field)) & 1u; + const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale; + const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X); - glViewport(0, field_offset, display_width, display_height); + prog.Uniform2i(0, reinterpret_field_offset, reinterpret_start_x); + m_batch_ubo_dirty = true; - prog.Uniform3i(0, static_cast<s32>(vram_offset_x), static_cast<s32>(flipped_vram_offset_y), - static_cast<s32>(field_offset)); - m_batch_ubo_dirty = true; + glViewport(reinterpret_start_x, scaled_flipped_vram_offset_y, reinterpret_width, scaled_display_height); + glDrawArrays(GL_TRIANGLES, 0, 3); - glDrawArrays(GL_TRIANGLES, 0, 3); - - m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), - m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0, - display_height, display_width, -static_cast<s32>(display_height)); - } - else - { - m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_texture.Bind(); - - glViewport(0, field_offset, scaled_display_width, scaled_display_height); - - prog.Uniform3i(0, static_cast<s32>(scaled_vram_offset_x), static_cast<s32>(scaled_flipped_vram_offset_y), - static_cast<s32>(field_offset)); - m_batch_ubo_dirty = true; - - glDrawArrays(GL_TRIANGLES, 0, 3); - - m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), - m_display_texture.GetWidth(), m_display_texture.GetHeight(), 0, - scaled_display_height, scaled_display_width, - -static_cast<s32>(scaled_display_height)); - } - - // restore state - m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - glViewport(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); - glEnable(GL_SCISSOR_TEST); + m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())), + m_display_texture.GetWidth(), m_display_texture.GetHeight(), + scaled_vram_offset_x, m_vram_texture.GetHeight() - scaled_vram_offset_y, + scaled_display_width, -static_cast<s32>(scaled_display_height)); } - m_host_display->SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height, - m_crtc_state.display_origin_left, m_crtc_state.display_origin_top, - m_crtc_state.display_vram_width, m_crtc_state.display_vram_height, - m_crtc_state.display_aspect_ratio); + // restore state + m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + glViewport(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); + glEnable(GL_SCISSOR_TEST); } + + m_host_display->SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height, + m_crtc_state.display_origin_left, m_crtc_state.display_origin_top, + m_crtc_state.display_vram_width, m_crtc_state.display_vram_height, + m_crtc_state.display_aspect_ratio); } void GPU_HW_OpenGL_ES::ReadVRAM(u32 x, u32 y, u32 width, u32 height) diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 3bf86d29c..7189ba8fd 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -750,7 +750,7 @@ std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bo DefineMacro(ss, "INTERLACED", interlaced); WriteCommonFunctions(ss); - DeclareUniformBuffer(ss, {"int3 u_base_coords"}); + DeclareUniformBuffer(ss, {"int u_field_offset", "int u_vram_start_x"}); DeclareTexture(ss, "samp0", 0); DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); @@ -759,55 +759,28 @@ std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bo int2 icoords = int2(v_pos.xy); #if INTERLACED - if (((icoords.y - u_base_coords.z) & 1) != 0) + if (((icoords.y / RESOLUTION_SCALE) & 1) != u_field_offset) discard; #endif #if DEPTH_24BIT - // compute offset in dwords from the start of the 24-bit values - int2 base = int2(u_base_coords.x, u_base_coords.y + icoords.y); - int xoff = int(icoords.x); - int dword_index = (xoff / 2) + (xoff / 4); + // relative to start of scanout + int relative_x = (icoords.x - u_vram_start_x) / RESOLUTION_SCALE; + icoords.x = u_vram_start_x + ((relative_x * 3) / 2) * RESOLUTION_SCALE; - // sample two adjacent dwords, or four 16-bit values as the 24-bit value will lie somewhere between these - uint s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(base.x + dword_index * 2 + 0, base.y), 0)); - uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(base.x + dword_index * 2 + 1, base.y), 0)); - uint s2 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(base.x + (dword_index + 1) * 2 + 0, base.y), 0)); - uint s3 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(base.x + (dword_index + 1) * 2 + 1, base.y), 0)); - - // select the bit for this pixel depending on its offset in the 4-pixel block - uint r, g, b; - int block_offset = xoff & 3; - if (block_offset == 0) - { - r = s0 & 0xFFu; - g = s0 >> 8; - b = s1 & 0xFFu; - } - else if (block_offset == 1) - { - r = s1 >> 8; - g = s2 & 0xFFu; - b = s2 >> 8; - } - else if (block_offset == 2) - { - r = s1 & 0xFFu; - g = s1 >> 8; - b = s2 & 0xFFu; - } - else - { - r = s2 >> 8; - g = s3 & 0xFFu; - b = s3 >> 8; - } - - // and normalize - o_col0 = float4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, 1.0); + // load adjacent 16-bit texels + uint s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords, 0)); + uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords + int2(RESOLUTION_SCALE, 0), 0)); + + // select which part of the combined 16-bit texels we are currently shading + uint s1s0 = ((s1 << 16) | s0) >> ((relative_x & 1) * 8); + + // extract components and normalize + o_col0 = float4(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0, + float((s1s0 >> 16u) & 0xFFu) / 255.0, 1.0); #else // load and return - o_col0 = LOAD_TEXTURE(samp0, u_base_coords.xy + icoords, 0); + o_col0 = LOAD_TEXTURE(samp0, icoords, 0); #endif } )";