// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)

#include "gpu_hw.h"
#include "cpu_core.h"
#include "gpu_hw_shadergen.h"
#include "gpu_sw_backend.h"
#include "host.h"
#include "pgxp.h"
#include "settings.h"
#include "system.h"

#include "util/imgui_manager.h"
#include "util/state_wrapper.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/scoped_guard.h"
#include "common/string_util.h"

#include "IconsFontAwesome5.h"
#include "imgui.h"

#include <cmath>
#include <sstream>
#include <tuple>

Log_SetChannel(GPU_HW);

// TODO: instead of full state restore, only restore what changed

static constexpr GPUTexture::Format VRAM_RT_FORMAT = GPUTexture::Format::RGBA8;
static constexpr GPUTexture::Format VRAM_DS_FORMAT = GPUTexture::Format::D16;

#ifdef _DEBUG
static u32 s_draw_number = 0;
#endif

template<typename T>
ALWAYS_INLINE static constexpr std::tuple<T, T> MinMax(T v1, T v2)
{
  if (v1 > v2)
    return std::tie(v2, v1);
  else
    return std::tie(v1, v2);
}

ALWAYS_INLINE static u32 GetMaxResolutionScale()
{
  return g_gpu_device->GetMaxTextureSize() / VRAM_WIDTH;
}

ALWAYS_INLINE_RELEASE static u32 GetBoxDownsampleScale()
{
  u32 scale = std::min(g_settings.gpu_resolution_scale, g_settings.gpu_downsample_scale);
  while ((g_settings.gpu_resolution_scale % scale) != 0)
    scale--;
  return scale;
}

ALWAYS_INLINE static bool ShouldUseUVLimits()
{
  // We only need UV limits if PGXP is enabled, or texture filtering is enabled.
  return g_settings.gpu_pgxp_enable || g_settings.gpu_texture_filter != GPUTextureFilter::Nearest;
}

ALWAYS_INLINE static bool ShouldDisableColorPerspective()
{
  return g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction && !g_settings.gpu_pgxp_color_correction;
}

/// Returns true if the specified texture filtering mode requires dual-source blending.
ALWAYS_INLINE static bool TextureFilterRequiresDualSourceBlend(GPUTextureFilter filter)
{
  return (filter == GPUTextureFilter::Bilinear || filter == GPUTextureFilter::JINC2 || filter == GPUTextureFilter::xBR);
}

/// Computes the area affected by a VRAM transfer, including wrap-around of X and Y.
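/// Worked example (illustrative values, not from the original source): a transfer of width 64 starting at
/// x = 1000 would span columns 1000..1063, past VRAM_WIDTH (1024), so the transfer wraps and the returned
/// bounds widen to the full 0..1024 column range; the same clamping is applied vertically against VRAM_HEIGHT.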
static Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height)
{
  Common::Rectangle<u32> out_rc = Common::Rectangle<u32>::FromExtents(x % VRAM_WIDTH, y % VRAM_HEIGHT, width, height);
  if (out_rc.right > VRAM_WIDTH)
  {
    out_rc.left = 0;
    out_rc.right = VRAM_WIDTH;
  }
  if (out_rc.bottom > VRAM_HEIGHT)
  {
    out_rc.top = 0;
    out_rc.bottom = VRAM_HEIGHT;
  }
  return out_rc;
}

namespace {
class ShaderCompileProgressTracker
{
public:
  ShaderCompileProgressTracker(std::string title, u32 total)
    : m_title(std::move(title)), m_min_time(Common::Timer::ConvertSecondsToValue(1.0)),
      m_update_interval(Common::Timer::ConvertSecondsToValue(0.1)), m_start_time(Common::Timer::GetCurrentValue()),
      m_last_update_time(0), m_progress(0), m_total(total)
  {
  }
  ~ShaderCompileProgressTracker() = default;

  void Increment()
  {
    m_progress++;

    const u64 tv = Common::Timer::GetCurrentValue();
    if ((tv - m_start_time) >= m_min_time && (tv - m_last_update_time) >= m_update_interval)
    {
      Host::DisplayLoadingScreen(m_title.c_str(), 0, static_cast<int>(m_total), static_cast<int>(m_progress));
      m_last_update_time = tv;
    }
  }

private:
  std::string m_title;
  u64 m_min_time;
  u64 m_update_interval;
  u64 m_start_time;
  u64 m_last_update_time;
  u32 m_progress;
  u32 m_total;
};
} // namespace

GPU_HW::GPU_HW() : GPU()
{
  m_vram_ptr = m_vram_shadow.data();
#ifdef _DEBUG
  s_draw_number = 0;
#endif
}

GPU_HW::~GPU_HW()
{
  if (m_sw_renderer)
  {
    m_sw_renderer->Shutdown();
    m_sw_renderer.reset();
  }
}

const Threading::Thread* GPU_HW::GetSWThread() const
{
  return m_sw_renderer ? m_sw_renderer->GetThread() : nullptr;
}

bool GPU_HW::IsHardwareRenderer() const
{
  return true;
}

bool GPU_HW::Initialize()
{
  if (!GPU::Initialize())
    return false;

  const GPUDevice::Features features = g_gpu_device->GetFeatures();

  m_resolution_scale = CalculateResolutionScale();
  m_multisamples = std::min(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples());
  m_supports_dual_source_blend = features.dual_source_blend;
  m_per_sample_shading = g_settings.gpu_per_sample_shading && features.per_sample_shading;
  m_true_color = g_settings.gpu_true_color;
  m_scaled_dithering = g_settings.gpu_scaled_dithering;
  m_texture_filtering = g_settings.gpu_texture_filter;
  m_using_uv_limits = ShouldUseUVLimits();
  m_chroma_smoothing = g_settings.gpu_24bit_chroma_smoothing;
  m_downsample_mode = GetDownsampleMode(m_resolution_scale);
  m_wireframe_mode = g_settings.gpu_wireframe_mode;
  m_disable_color_perspective = features.noperspective_interpolation && ShouldDisableColorPerspective();

  CheckSettings();

  UpdateSoftwareRenderer(false);

  PrintSettingsToLog();

  if (!CompilePipelines())
  {
    Log_ErrorPrintf("Failed to compile pipelines");
    return false;
  }

  if (!CreateBuffers())
  {
    Log_ErrorPrintf("Failed to create framebuffer");
    return false;
  }

  RestoreDeviceContext();
  return true;
}

void GPU_HW::Reset(bool clear_vram)
{
  GPU::Reset(clear_vram);

  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;

  m_vram_shadow.fill(0);
  if (m_sw_renderer)
    m_sw_renderer->Reset(clear_vram);

  m_batch = {};
  m_batch_ubo_data = {};
  m_batch_ubo_dirty = true;
  m_current_depth = 1;

  if (clear_vram)
    ClearFramebuffer();
}

bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_display)
{
  if (!GPU::DoState(sw, host_texture, update_display))
    return false;

  if (host_texture)
  {
    GPUTexture* tex = *host_texture;
    if (sw.IsReading())
    {
      if (tex->GetWidth() != m_vram_texture->GetWidth() || tex->GetHeight() != m_vram_texture->GetHeight() ||
          tex->GetSamples() != m_vram_texture->GetSamples())
      {
        return false;
      }

      g_gpu_device->CopyTextureRegion(m_vram_texture.get(), 0, 0, 0, 0, tex, 0, 0, 0, 0, tex->GetWidth(),
                                      tex->GetHeight());
    }
    else
    {
      if (!tex || tex->GetWidth() != m_vram_texture->GetWidth() || tex->GetHeight() != m_vram_texture->GetHeight() ||
          tex->GetSamples() != m_vram_texture->GetSamples())
      {
        delete tex;

        tex = g_gpu_device
                ->CreateTexture(m_vram_texture->GetWidth(), m_vram_texture->GetHeight(), 1, 1,
                                m_vram_texture->GetSamples(), GPUTexture::Type::RenderTarget,
                                GPUTexture::Format::RGBA8, nullptr, 0, false)
                .release();
        *host_texture = tex;
        if (!tex)
          return false;
      }

      g_gpu_device->CopyTextureRegion(tex, 0, 0, 0, 0, m_vram_texture.get(), 0, 0, 0, 0, tex->GetWidth(),
                                      tex->GetHeight());
    }
  }

  // invalidate the whole VRAM read texture when loading state
  if (sw.IsReading())
  {
    m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
    SetFullVRAMDirtyRectangle();
    ResetBatchVertexDepth();
  }

  return true;
}

void GPU_HW::RestoreDeviceContext()
{
  g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
  g_gpu_device->SetFramebuffer(m_vram_framebuffer.get());
  g_gpu_device->SetViewport(0, 0, m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
  SetScissor();
  m_batch_ubo_dirty = true;
}

void GPU_HW::UpdateSettings(const Settings& old_settings)
{
  GPU::UpdateSettings(old_settings);

  const GPUDevice::Features features = g_gpu_device->GetFeatures();
  const u32 resolution_scale = CalculateResolutionScale();
  const u32 multisamples = std::min(g_settings.gpu_multisamples, g_gpu_device->GetMaxMultisamples());
  const bool per_sample_shading = g_settings.gpu_per_sample_shading && features.per_sample_shading;
  const GPUDownsampleMode downsample_mode = GetDownsampleMode(resolution_scale);
  const GPUWireframeMode wireframe_mode =
    features.geometry_shaders ? g_settings.gpu_wireframe_mode : GPUWireframeMode::Disabled;
  const bool use_uv_limits = ShouldUseUVLimits();
  const bool disable_color_perspective = features.noperspective_interpolation && ShouldDisableColorPerspective();

  // TODO: Use old_settings
  const bool framebuffer_changed =
    (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
     m_downsample_mode != downsample_mode ||
     (m_downsample_mode == GPUDownsampleMode::Box &&
      g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale));
  const bool shaders_changed =
    (m_resolution_scale != resolution_scale || m_multisamples != multisamples ||
     m_true_color != g_settings.gpu_true_color || m_per_sample_shading != per_sample_shading ||
     m_scaled_dithering != g_settings.gpu_scaled_dithering || m_texture_filtering != g_settings.gpu_texture_filter ||
     m_using_uv_limits != use_uv_limits || m_chroma_smoothing != g_settings.gpu_24bit_chroma_smoothing ||
     m_downsample_mode != downsample_mode ||
     (m_downsample_mode == GPUDownsampleMode::Box &&
      g_settings.gpu_downsample_scale != old_settings.gpu_downsample_scale) ||
     m_wireframe_mode != wireframe_mode || m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer() ||
     m_disable_color_perspective != disable_color_perspective);

  if (m_resolution_scale != resolution_scale)
  {
    Host::AddKeyedFormattedOSDMessage(
      "ResolutionScale", 10.0f, TRANSLATE("OSDMessage", "Resolution scale set to %ux (display %ux%u, VRAM %ux%u)"),
      resolution_scale, m_crtc_state.display_vram_width * resolution_scale,
      resolution_scale * m_crtc_state.display_vram_height, VRAM_WIDTH * resolution_scale,
      VRAM_HEIGHT * resolution_scale);
  }

  if (m_multisamples != multisamples || m_per_sample_shading != per_sample_shading)
  {
    if (per_sample_shading)
    {
      Host::AddKeyedFormattedOSDMessage("Multisampling", 10.0f,
TRANSLATE("OSDMessage", "Multisample anti-aliasing set to %ux (SSAA)."), multisamples); } else { Host::AddKeyedFormattedOSDMessage("Multisampling", 10.0f, TRANSLATE("OSDMessage", "Multisample anti-aliasing set to %ux."), multisamples); } } // Back up VRAM if we're recreating the framebuffer. if (framebuffer_changed) { RestoreDeviceContext(); ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT); DestroyBuffers(); } m_resolution_scale = resolution_scale; m_multisamples = multisamples; m_per_sample_shading = per_sample_shading; m_true_color = g_settings.gpu_true_color; m_scaled_dithering = g_settings.gpu_scaled_dithering; m_texture_filtering = g_settings.gpu_texture_filter; m_using_uv_limits = use_uv_limits; m_chroma_smoothing = g_settings.gpu_24bit_chroma_smoothing; m_downsample_mode = downsample_mode; m_wireframe_mode = wireframe_mode; m_disable_color_perspective = disable_color_perspective; CheckSettings(); if (m_pgxp_depth_buffer != g_settings.UsingPGXPDepthBuffer()) { m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer(); m_batch.use_depth_buffer = false; if (m_pgxp_depth_buffer) ClearDepthBuffer(); } UpdateSoftwareRenderer(true); PrintSettingsToLog(); if (shaders_changed) { DestroyPipelines(); if (!CompilePipelines()) Panic("Failed to recompile pipelnes."); } if (framebuffer_changed) { // TODO: weird vram loss when rapidly changing resolutions if (!CreateBuffers()) Panic("Failed to recreate buffers."); RestoreDeviceContext(); UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_ptr, false, false); UpdateDepthBufferFromMaskBit(); UpdateDisplay(); } } void GPU_HW::CheckSettings() { const GPUDevice::Features features = g_gpu_device->GetFeatures(); if (m_multisamples != g_settings.gpu_multisamples) { Host::AddIconOSDMessage("MSAAUnsupported", ICON_FA_PAINT_BRUSH, fmt::format(TRANSLATE_FS("OSDMessage", "{}x MSAA is not supported, using {}x instead."), g_settings.gpu_multisamples, m_multisamples), Host::OSD_CRITICAL_ERROR_DURATION); } else { Host::RemoveKeyedOSDMessage("MSAAUnsupported"); } if (!m_per_sample_shading && g_settings.gpu_per_sample_shading) { Host::AddOSDMessage(TRANSLATE_STR("OSDMessage", "SSAA is not supported, using MSAA instead."), 20.0f); } if (!features.dual_source_blend && TextureFilterRequiresDualSourceBlend(m_texture_filtering)) { Host::AddFormattedOSDMessage( Host::OSD_CRITICAL_ERROR_DURATION, TRANSLATE("OSDMessage", "Texture filter '%s' is not supported with the current renderer."), Settings::GetTextureFilterDisplayName(m_texture_filtering)); m_texture_filtering = GPUTextureFilter::Nearest; } if (!features.noperspective_interpolation && !ShouldDisableColorPerspective()) Log_WarningPrint("Disable color perspective not supported, but should be used."); if (!features.geometry_shaders && m_wireframe_mode != GPUWireframeMode::Disabled) { Host::AddOSDMessage( TRANSLATE("OSDMessage", "Geometry shaders are not supported by your GPU, and are required for wireframe rendering."), Host::OSD_CRITICAL_ERROR_DURATION); m_wireframe_mode = GPUWireframeMode::Disabled; } if (m_downsample_mode == GPUDownsampleMode::Box) { const u32 scale = GetBoxDownsampleScale(); if (scale != g_settings.gpu_downsample_scale || scale == g_settings.gpu_resolution_scale) { Host::AddIconOSDMessage( "BoxDownsampleUnsupported", ICON_FA_PAINT_BRUSH, fmt::format( TRANSLATE_FS("OSDMessage", "Resolution scale {0}x is not divisible by downsample scale {1}x, using {2}x instead."), g_settings.gpu_resolution_scale, g_settings.gpu_downsample_scale, scale), Host::OSD_ERROR_DURATION); } else { 
      Host::RemoveKeyedOSDMessage("BoxDownsampleUnsupported");
    }

    if (scale == g_settings.gpu_resolution_scale)
      m_downsample_mode = GPUDownsampleMode::Disabled;
  }

  m_pgxp_depth_buffer = g_settings.UsingPGXPDepthBuffer();
}

u32 GPU_HW::CalculateResolutionScale() const
{
  const u32 max_resolution_scale = GetMaxResolutionScale();

  u32 scale;
  if (g_settings.gpu_resolution_scale != 0)
  {
    scale = std::clamp<u32>(g_settings.gpu_resolution_scale, 1u, max_resolution_scale);
  }
  else
  {
    // Auto scaling. When the system is starting and all borders crop is enabled, the registers are zero, and
    // display_height therefore is also zero. Use the default size from the region in this case.
    const s32 height = (m_crtc_state.display_height != 0) ?
                         static_cast<s32>(m_crtc_state.display_height) :
                         (m_console_is_pal ? (PAL_VERTICAL_ACTIVE_END - PAL_VERTICAL_ACTIVE_START) :
                                             (NTSC_VERTICAL_ACTIVE_END - NTSC_VERTICAL_ACTIVE_START));
    const s32 preferred_scale =
      static_cast<s32>(std::ceil(static_cast<float>(g_gpu_device->GetWindowHeight()) / height));
    Log_InfoPrintf("Height = %d, preferred scale = %d", height, preferred_scale);

    scale = static_cast<u32>(std::clamp<s32>(preferred_scale, 1, static_cast<s32>(max_resolution_scale)));
  }

  if (g_settings.gpu_downsample_mode == GPUDownsampleMode::Adaptive && scale > 1 && !Common::IsPow2(scale))
  {
    const u32 new_scale = Common::PreviousPow2(scale);
    Log_InfoPrintf("Resolution scale %ux not supported for adaptive smoothing, using %ux", scale, new_scale);

    if (g_settings.gpu_resolution_scale != 0)
    {
      Host::AddFormattedOSDMessage(
        10.0f, TRANSLATE("OSDMessage", "Resolution scale %ux not supported for adaptive smoothing, using %ux."), scale,
        new_scale);
    }

    scale = new_scale;
  }

  return scale;
}

void GPU_HW::UpdateResolutionScale()
{
  GPU::UpdateResolutionScale();

  if (CalculateResolutionScale() != m_resolution_scale)
    UpdateSettings(g_settings);
}

GPUDownsampleMode GPU_HW::GetDownsampleMode(u32 resolution_scale) const
{
  return (resolution_scale == 1) ? GPUDownsampleMode::Disabled : g_settings.gpu_downsample_mode;
}

bool GPU_HW::IsUsingMultisampling() const
{
  return m_multisamples > 1;
}

bool GPU_HW::IsUsingDownsampling() const
{
  return (m_downsample_mode != GPUDownsampleMode::Disabled && !m_GPUSTAT.display_area_color_depth_24);
}

void GPU_HW::SetFullVRAMDirtyRectangle()
{
  m_vram_dirty_rect.Set(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
  m_draw_mode.SetTexturePageChanged();
}

void GPU_HW::ClearVRAMDirtyRectangle()
{
  m_vram_dirty_rect.SetInvalid();
}

std::tuple<u32, u32> GPU_HW::GetEffectiveDisplayResolution(bool scaled /* = true */)
{
  const u32 scale = scaled ? m_resolution_scale : 1u;
  return std::make_tuple(m_crtc_state.display_vram_width * scale, m_crtc_state.display_vram_height * scale);
}

std::tuple<u32, u32> GPU_HW::GetFullDisplayResolution(bool scaled /* = true */)
{
  const u32 scale = scaled ? m_resolution_scale : 1u;
  return std::make_tuple(m_crtc_state.display_width * scale, m_crtc_state.display_height * scale);
}

void GPU_HW::PrintSettingsToLog()
{
  Log_InfoPrintf("Resolution Scale: %u (%ux%u), maximum %u", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
                 VRAM_HEIGHT * m_resolution_scale, GetMaxResolutionScale());
  Log_InfoPrintf("Multisampling: %ux%s", m_multisamples, m_per_sample_shading ? " (per sample shading)" : "");
  Log_InfoPrintf("Dithering: %s%s", m_true_color ? "Disabled" : "Enabled",
                 (!m_true_color && m_scaled_dithering) ? " (Scaled)" : "");
  Log_InfoPrintf("Texture Filtering: %s", Settings::GetTextureFilterDisplayName(m_texture_filtering));
  Log_InfoPrintf("Dual-source blending: %s", m_supports_dual_source_blend ? "Supported" : "Not supported");
"Supported" : "Not supported"); Log_InfoPrintf("Using UV limits: %s", m_using_uv_limits ? "YES" : "NO"); Log_InfoPrintf("Depth buffer: %s", m_pgxp_depth_buffer ? "YES" : "NO"); Log_InfoPrintf("Downsampling: %s", Settings::GetDownsampleModeDisplayName(m_downsample_mode)); Log_InfoPrintf("Wireframe rendering: %s", Settings::GetGPUWireframeModeDisplayName(m_wireframe_mode)); Log_InfoPrintf("Using software renderer for readbacks: %s", m_sw_renderer ? "YES" : "NO"); } bool GPU_HW::CreateBuffers() { DestroyBuffers(); // scale vram size to internal resolution const u32 texture_width = VRAM_WIDTH * m_resolution_scale; const u32 texture_height = VRAM_HEIGHT * m_resolution_scale; const u8 samples = static_cast(m_multisamples); // Needed for Metal resolve. const GPUTexture::Type read_texture_type = (g_gpu_device->GetRenderAPI() == RenderAPI::Metal && m_multisamples > 1) ? GPUTexture::Type::RWTexture : GPUTexture::Type::Texture; if (!(m_vram_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, samples, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)) || !(m_vram_depth_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, samples, GPUTexture::Type::DepthStencil, VRAM_DS_FORMAT)) || !(m_vram_read_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, 1, read_texture_type, VRAM_RT_FORMAT)) || !(m_display_private_texture = g_gpu_device->CreateTexture( ((m_downsample_mode == GPUDownsampleMode::Adaptive) ? VRAM_WIDTH : GPU_MAX_DISPLAY_WIDTH) * m_resolution_scale, GPU_MAX_DISPLAY_HEIGHT * m_resolution_scale, 1, 1, 1, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)) || !(m_vram_readback_texture = g_gpu_device->CreateTexture(VRAM_WIDTH / 2, VRAM_HEIGHT, 1, 1, 1, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT))) { return false; } GL_OBJECT_NAME(m_vram_texture, "VRAM Texture"); GL_OBJECT_NAME(m_vram_depth_texture, "VRAM Depth Texture"); GL_OBJECT_NAME(m_vram_read_texture, "VRAM Read Texture"); GL_OBJECT_NAME(m_display_private_texture, "Display Texture"); GL_OBJECT_NAME(m_vram_readback_texture, "VRAM Readback Texture"); // vram framebuffer has both colour and depth if (!(m_vram_framebuffer = g_gpu_device->CreateFramebuffer(m_vram_texture.get(), m_vram_depth_texture.get())) || !(m_vram_update_depth_framebuffer = g_gpu_device->CreateFramebuffer(m_vram_depth_texture.get())) || !(m_vram_readback_framebuffer = g_gpu_device->CreateFramebuffer(m_vram_readback_texture.get())) || !(m_display_framebuffer = g_gpu_device->CreateFramebuffer(m_display_private_texture.get()))) { return false; } GL_OBJECT_NAME(m_vram_framebuffer, "VRAM Framebuffer"); GL_OBJECT_NAME(m_vram_update_depth_framebuffer, "VRAM Update Depth Framebuffer"); GL_OBJECT_NAME(m_vram_readback_framebuffer, "VRAM Readback Framebuffer"); GL_OBJECT_NAME(m_display_framebuffer, "Display Framebuffer"); if (!(m_vram_upload_buffer = g_gpu_device->CreateTextureBuffer(GPUTextureBuffer::Format::R16UI, VRAM_UPDATE_TEXTURE_BUFFER_SIZE / sizeof(u16)))) { return false; } Log_InfoPrintf("Created HW framebuffer of %ux%u", texture_width, texture_height); if (m_downsample_mode == GPUDownsampleMode::Adaptive) { const u32 levels = GetAdaptiveDownsamplingMipLevels(); if (!(m_downsample_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, levels, 1, GPUTexture::Type::Texture, VRAM_RT_FORMAT)) || !(m_downsample_render_texture = g_gpu_device->CreateTexture(texture_width, texture_height, 1, 1, 1, GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)) || !(m_downsample_framebuffer = 
        !(m_downsample_weight_texture =
            g_gpu_device->CreateTexture(texture_width >> (levels - 1), texture_height >> (levels - 1), 1, 1, 1,
                                        GPUTexture::Type::RenderTarget, GPUTexture::Format::R8)) ||
        !(m_downsample_weight_framebuffer = g_gpu_device->CreateFramebuffer(m_downsample_weight_texture.get())))
    {
      return false;
    }
  }
  else if (m_downsample_mode == GPUDownsampleMode::Box)
  {
    const u32 downsample_scale = GetBoxDownsampleScale();
    if (!(m_downsample_render_texture =
            g_gpu_device->CreateTexture(VRAM_WIDTH * downsample_scale, VRAM_HEIGHT * downsample_scale, 1, 1, 1,
                                        GPUTexture::Type::RenderTarget, VRAM_RT_FORMAT)) ||
        !(m_downsample_framebuffer = g_gpu_device->CreateFramebuffer(m_downsample_render_texture.get())))
    {
      return false;
    }
  }

  g_gpu_device->SetFramebuffer(m_vram_framebuffer.get());
  SetFullVRAMDirtyRectangle();
  return true;
}

void GPU_HW::ClearFramebuffer()
{
  g_gpu_device->ClearRenderTarget(m_vram_texture.get(), 0);
  g_gpu_device->ClearDepth(m_vram_depth_texture.get(), m_pgxp_depth_buffer ? 1.0f : 0.0f);
  ClearVRAMDirtyRectangle();
  g_gpu_device->ClearRenderTarget(m_display_private_texture.get(), 0);
  m_last_depth_z = 1.0f;
}

void GPU_HW::DestroyBuffers()
{
  ClearDisplayTexture();

  m_vram_upload_buffer.reset();
  m_downsample_weight_framebuffer.reset();
  m_downsample_weight_texture.reset();
  m_downsample_framebuffer.reset();
  m_downsample_render_texture.reset();
  m_downsample_texture.reset();
  m_display_framebuffer.reset();
  m_vram_readback_framebuffer.reset();
  m_vram_update_depth_framebuffer.reset();
  m_vram_framebuffer.reset();
  m_vram_read_texture.reset();
  m_vram_depth_view.reset();
  m_vram_depth_texture.reset();
  m_vram_texture.reset();
  m_vram_readback_texture.reset();
  m_display_private_texture.reset();
}

bool GPU_HW::CompilePipelines()
{
  const GPUDevice::Features features = g_gpu_device->GetFeatures();
  GPU_HW_ShaderGen shadergen(g_gpu_device->GetRenderAPI(), m_resolution_scale, m_multisamples, m_per_sample_shading,
                             m_true_color, m_scaled_dithering, m_texture_filtering, m_using_uv_limits,
                             m_pgxp_depth_buffer, m_disable_color_perspective, m_supports_dual_source_blend);

  ShaderCompileProgressTracker progress("Compiling Pipelines",
                                        2 + (4 * 9 * 2 * 2) + (3 * 4 * 5 * 9 * 2 * 2) + 1 + 2 + (2 * 2) + 2 + 1 + 1 +
                                          (2 * 3) + 1);

  // vertex shaders - [textured]
  // fragment shaders - [render_mode][texture_mode][dithering][interlacing]
  static constexpr auto destroy_shader = [](std::unique_ptr<GPUShader>& s) { s.reset(); };
  DimensionalArray<std::unique_ptr<GPUShader>, 2> batch_vertex_shaders{};
  DimensionalArray<std::unique_ptr<GPUShader>, 2, 2, 9, 4> batch_fragment_shaders{};
  ScopedGuard batch_shader_guard([&batch_vertex_shaders, &batch_fragment_shaders]() {
    batch_vertex_shaders.enumerate(destroy_shader);
    batch_fragment_shaders.enumerate(destroy_shader);
  });

  for (u8 textured = 0; textured < 2; textured++)
  {
    const std::string vs = shadergen.GenerateBatchVertexShader(ConvertToBoolUnchecked(textured));
    if (!(batch_vertex_shaders[textured] = g_gpu_device->CreateShader(GPUShaderStage::Vertex, vs)))
      return false;

    progress.Increment();
  }

  for (u8 render_mode = 0; render_mode < 4; render_mode++)
  {
    for (u8 texture_mode = 0; texture_mode < 9; texture_mode++)
    {
      for (u8 dithering = 0; dithering < 2; dithering++)
      {
        for (u8 interlacing = 0; interlacing < 2; interlacing++)
        {
          const std::string fs = shadergen.GenerateBatchFragmentShader(
            static_cast<BatchRenderMode>(render_mode), static_cast<GPUTextureMode>(texture_mode),
            ConvertToBoolUnchecked(dithering), ConvertToBoolUnchecked(interlacing));
          if (!(batch_fragment_shaders[render_mode][texture_mode][dithering][interlacing] =
                  g_gpu_device->CreateShader(GPUShaderStage::Fragment, fs)))
          {
            return false;
          }

          progress.Increment();
        }
      }
    }
  }
  static constexpr GPUPipeline::VertexAttribute vertex_attributes[] = {
    GPUPipeline::VertexAttribute::Make(0, GPUPipeline::VertexAttribute::Semantic::Position, 0,
                                       GPUPipeline::VertexAttribute::Type::Float, 4, offsetof(BatchVertex, x)),
    GPUPipeline::VertexAttribute::Make(1, GPUPipeline::VertexAttribute::Semantic::Color, 0,
                                       GPUPipeline::VertexAttribute::Type::UNorm8, 4, offsetof(BatchVertex, color)),
    GPUPipeline::VertexAttribute::Make(2, GPUPipeline::VertexAttribute::Semantic::TexCoord, 0,
                                       GPUPipeline::VertexAttribute::Type::UInt32, 1, offsetof(BatchVertex, u)),
    GPUPipeline::VertexAttribute::Make(3, GPUPipeline::VertexAttribute::Semantic::TexCoord, 1,
                                       GPUPipeline::VertexAttribute::Type::UInt32, 1, offsetof(BatchVertex, texpage)),
    GPUPipeline::VertexAttribute::Make(4, GPUPipeline::VertexAttribute::Semantic::TexCoord, 2,
                                       GPUPipeline::VertexAttribute::Type::UNorm8, 4,
                                       offsetof(BatchVertex, uv_limits)),
  };
  static constexpr u32 NUM_BATCH_VERTEX_ATTRIBUTES = 2;
  static constexpr u32 NUM_BATCH_TEXTURED_VERTEX_ATTRIBUTES = 4;
  static constexpr u32 NUM_BATCH_TEXTURED_LIMITS_VERTEX_ATTRIBUTES = 5;

  GPUPipeline::GraphicsConfig plconfig = {};
  plconfig.layout = GPUPipeline::Layout::SingleTextureAndUBO;
  plconfig.input_layout.vertex_stride = sizeof(BatchVertex);
  plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
  plconfig.primitive = GPUPipeline::Primitive::Triangles;
  plconfig.color_format = VRAM_RT_FORMAT;
  plconfig.depth_format = VRAM_DS_FORMAT;
  plconfig.samples = m_multisamples;
  plconfig.per_sample_shading = m_per_sample_shading;
  plconfig.geometry_shader = nullptr;

  // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
  for (u8 depth_test = 0; depth_test < 3; depth_test++)
  {
    for (u8 render_mode = 0; render_mode < 4; render_mode++)
    {
      for (u8 transparency_mode = 0; transparency_mode < 5; transparency_mode++)
      {
        for (u8 texture_mode = 0; texture_mode < 9; texture_mode++)
        {
          for (u8 dithering = 0; dithering < 2; dithering++)
          {
            for (u8 interlacing = 0; interlacing < 2; interlacing++)
            {
              static constexpr std::array<GPUPipeline::DepthFunc, 3> depth_test_values = {
                GPUPipeline::DepthFunc::Always, GPUPipeline::DepthFunc::GreaterEqual,
                GPUPipeline::DepthFunc::LessEqual};
              const bool textured = (static_cast<GPUTextureMode>(texture_mode) != GPUTextureMode::Disabled);

              plconfig.input_layout.vertex_attributes =
                textured ?
                  (m_using_uv_limits ?
                     std::span<const GPUPipeline::VertexAttribute>(vertex_attributes,
                                                                   NUM_BATCH_TEXTURED_LIMITS_VERTEX_ATTRIBUTES) :
                     std::span<const GPUPipeline::VertexAttribute>(vertex_attributes,
                                                                   NUM_BATCH_TEXTURED_VERTEX_ATTRIBUTES)) :
                  std::span<const GPUPipeline::VertexAttribute>(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES);

              plconfig.vertex_shader = batch_vertex_shaders[BoolToUInt8(textured)].get();
              plconfig.fragment_shader =
                batch_fragment_shaders[render_mode][texture_mode][dithering][interlacing].get();

              plconfig.depth.depth_test = depth_test_values[depth_test];
              plconfig.depth.depth_write = !m_pgxp_depth_buffer || depth_test != 0;
              plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();

              if ((static_cast<GPUTransparencyMode>(transparency_mode) != GPUTransparencyMode::Disabled &&
                   (static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
                    static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque)) ||
                  m_texture_filtering != GPUTextureFilter::Nearest)
              {
                plconfig.blend.enable = true;
                plconfig.blend.src_alpha_blend = GPUPipeline::BlendFunc::One;
                plconfig.blend.dst_alpha_blend = GPUPipeline::BlendFunc::Zero;
                plconfig.blend.alpha_blend_op = GPUPipeline::BlendOp::Add;

                if (m_supports_dual_source_blend)
                {
                  plconfig.blend.src_blend = GPUPipeline::BlendFunc::One;
                  plconfig.blend.dst_blend = GPUPipeline::BlendFunc::SrcAlpha1;
                  plconfig.blend.blend_op =
                    (static_cast<GPUTransparencyMode>(transparency_mode) ==
                       GPUTransparencyMode::BackgroundMinusForeground &&
                     static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
                     static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque) ?
                      GPUPipeline::BlendOp::ReverseSubtract :
                      GPUPipeline::BlendOp::Add;
                }
                else
                {
                  const u32 factor = (static_cast<GPUTransparencyMode>(transparency_mode) ==
                                      GPUTransparencyMode::HalfBackgroundPlusHalfForeground) ?
                                       0xFF808080u :
                                       0xFFFFFFFFu;
                  plconfig.blend.src_blend = GPUPipeline::BlendFunc::One;
                  plconfig.blend.dst_blend = GPUPipeline::BlendFunc::ConstantColor;
                  plconfig.blend.blend_op =
                    (static_cast<GPUTransparencyMode>(transparency_mode) ==
                       GPUTransparencyMode::BackgroundMinusForeground &&
                     static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::TransparencyDisabled &&
                     static_cast<BatchRenderMode>(render_mode) != BatchRenderMode::OnlyOpaque) ?
                      GPUPipeline::BlendOp::ReverseSubtract :
                      GPUPipeline::BlendOp::Add;
                  plconfig.blend.constant = factor;
                }
              }

              if (!(m_batch_pipelines[depth_test][render_mode][texture_mode][transparency_mode][dithering]
                                     [interlacing] = g_gpu_device->CreatePipeline(plconfig)))
              {
                return false;
              }

              progress.Increment();
            }
          }
        }
      }
    }
  }

  if (m_wireframe_mode != GPUWireframeMode::Disabled)
  {
    std::unique_ptr<GPUShader> gs =
      g_gpu_device->CreateShader(GPUShaderStage::Geometry, shadergen.GenerateWireframeGeometryShader());
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateWireframeFragmentShader());
    if (!gs || !fs)
      return false;

    GL_OBJECT_NAME(gs, "Batch Wireframe Geometry Shader");
    GL_OBJECT_NAME(fs, "Batch Wireframe Fragment Shader");

    plconfig.input_layout.vertex_attributes =
      std::span<const GPUPipeline::VertexAttribute>(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES);
    plconfig.blend = (m_wireframe_mode == GPUWireframeMode::OverlayWireframe) ?
                       GPUPipeline::BlendState::GetAlphaBlendingState() :
                       GPUPipeline::BlendState::GetNoBlendingState();
    plconfig.blend.write_mask = 0x7;
    plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
    plconfig.vertex_shader = batch_vertex_shaders[0].get();
    plconfig.geometry_shader = gs.get();
    plconfig.fragment_shader = fs.get();

    if (!(m_wireframe_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_wireframe_pipeline, "Batch Wireframe Pipeline");

    plconfig.vertex_shader = nullptr;
    plconfig.geometry_shader = nullptr;
    plconfig.fragment_shader = nullptr;
  }

  batch_shader_guard.Run();

  std::unique_ptr<GPUShader> fullscreen_quad_vertex_shader =
    g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GenerateScreenQuadVertexShader());
  if (!fullscreen_quad_vertex_shader)
    return false;

  progress.Increment();

  // common state
  plconfig.input_layout.vertex_attributes = {};
  plconfig.input_layout.vertex_stride = 0;
  plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;
  plconfig.per_sample_shading = false;
  plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
  plconfig.vertex_shader = fullscreen_quad_vertex_shader.get();

  // VRAM fill
  for (u8 wrapped = 0; wrapped < 2; wrapped++)
  {
    for (u8 interlaced = 0; interlaced < 2; interlaced++)
    {
      std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(
        GPUShaderStage::Fragment,
        shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced)));
      if (!fs)
        return false;

      plconfig.fragment_shader = fs.get();
      plconfig.depth = GPUPipeline::DepthState::GetAlwaysWriteState();

      if (!(m_vram_fill_pipelines[wrapped][interlaced] = g_gpu_device->CreatePipeline(plconfig)))
        return false;

      progress.Increment();
    }
  }

  // VRAM copy
  {
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateVRAMCopyFragmentShader());
    if (!fs)
      return false;

    plconfig.fragment_shader = fs.get();
    for (u8 depth_test = 0; depth_test < 2; depth_test++)
    {
      plconfig.depth.depth_write = true;
      plconfig.depth.depth_test =
        (depth_test != 0) ? GPUPipeline::DepthFunc::GreaterEqual : GPUPipeline::DepthFunc::Always;

      if (!(m_vram_copy_pipelines[depth_test] = g_gpu_device->CreatePipeline(plconfig)))
        return false;

      GL_OBJECT_NAME(m_vram_copy_pipelines[depth_test], "VRAM Copy Pipeline, depth=%u", depth_test);

      progress.Increment();
    }
  }

  // VRAM write
  {
    const bool use_ssbo = features.texture_buffers_emulated_with_ssbo;
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateVRAMWriteFragmentShader(use_ssbo));
    if (!fs)
      return false;

    plconfig.layout = GPUPipeline::Layout::SingleTextureBufferAndPushConstants;
    plconfig.fragment_shader = fs.get();
    for (u8 depth_test = 0; depth_test < 2; depth_test++)
    {
      plconfig.depth.depth_write = true;
      plconfig.depth.depth_test =
        (depth_test != 0) ? GPUPipeline::DepthFunc::GreaterEqual : GPUPipeline::DepthFunc::Always;
      if (!(m_vram_write_pipelines[depth_test] = g_gpu_device->CreatePipeline(plconfig)))
        return false;

      GL_OBJECT_NAME(m_vram_write_pipelines[depth_test], "VRAM Write Pipeline, depth=%u", depth_test);

      progress.Increment();
    }
  }

  plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants;

  // VRAM update depth
  {
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateVRAMUpdateDepthFragmentShader());
    if (!fs)
      return false;

    plconfig.fragment_shader = fs.get();
    plconfig.color_format = GPUTexture::Format::Unknown;
    plconfig.depth_format = VRAM_DS_FORMAT;
    plconfig.depth = GPUPipeline::DepthState::GetAlwaysWriteState();
    plconfig.blend.write_mask = 0;

    if (!(m_vram_update_depth_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_vram_update_depth_pipeline, "VRAM Update Depth Pipeline");

    progress.Increment();
  }

  plconfig.color_format = VRAM_RT_FORMAT;
  plconfig.depth_format = GPUTexture::Format::Unknown;
  plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
  plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
  plconfig.samples = 1;
  plconfig.per_sample_shading = false;

  // VRAM read
  {
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateVRAMReadFragmentShader());
    if (!fs)
      return false;

    plconfig.fragment_shader = fs.get();

    if (!(m_vram_readback_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_vram_readback_pipeline, "VRAM Read Pipeline");
    progress.Increment();
  }

  // Display
  {
    for (u8 depth_24 = 0; depth_24 < 2; depth_24++)
    {
      for (u8 interlace_mode = 0; interlace_mode < 3; interlace_mode++)
      {
        std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(
          GPUShaderStage::Fragment,
          shadergen.GenerateDisplayFragmentShader(ConvertToBoolUnchecked(depth_24),
                                                  static_cast<InterlacedRenderMode>(interlace_mode),
                                                  m_chroma_smoothing));
        if (!fs)
          return false;

        plconfig.fragment_shader = fs.get();

        if (!(m_display_pipelines[depth_24][interlace_mode] = g_gpu_device->CreatePipeline(plconfig)))
          return false;

        progress.Increment();
      }
    }
  }

  {
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateCopyFragmentShader());
    if (!fs)
      return false;

    plconfig.fragment_shader = fs.get();
    if (!(m_copy_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;
  }

  if (m_downsample_mode == GPUDownsampleMode::Adaptive)
  {
    std::unique_ptr<GPUShader> vs =
      g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GenerateAdaptiveDownsampleVertexShader());
    std::unique_ptr<GPUShader> fs =
      g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GenerateAdaptiveDownsampleMipFragmentShader(true));
    if (!vs || !fs)
      return false;

    GL_OBJECT_NAME(vs, "Downsample Vertex Shader");
    GL_OBJECT_NAME(fs, "Downsample First Pass Fragment Shader");

    plconfig.vertex_shader = vs.get();
    plconfig.fragment_shader = fs.get();

    if (!(m_downsample_first_pass_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_downsample_first_pass_pipeline, "Downsample First Pass Pipeline");

    fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment,
                                    shadergen.GenerateAdaptiveDownsampleMipFragmentShader(false));
    if (!fs)
      return false;

    GL_OBJECT_NAME(fs, "Downsample Mid Pass Fragment Shader");

    plconfig.fragment_shader = fs.get();

    if (!(m_downsample_mid_pass_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_downsample_mid_pass_pipeline, "Downsample Mid Pass Pipeline");
    fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment,
                                    shadergen.GenerateAdaptiveDownsampleBlurFragmentShader());
    if (!fs)
      return false;

    GL_OBJECT_NAME(fs, "Downsample Blur Pass Fragment Shader");

    plconfig.fragment_shader = fs.get();
    plconfig.color_format = GPUTexture::Format::R8;

    if (!(m_downsample_blur_pass_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_downsample_blur_pass_pipeline, "Downsample Blur Pass Pipeline");

    fs = g_gpu_device->CreateShader(GPUShaderStage::Fragment,
                                    shadergen.GenerateAdaptiveDownsampleCompositeFragmentShader());
    if (!fs)
      return false;

    GL_OBJECT_NAME(fs, "Downsample Composite Pass Fragment Shader");

    plconfig.layout = GPUPipeline::Layout::MultiTextureAndPushConstants;
    plconfig.fragment_shader = fs.get();
    plconfig.color_format = VRAM_RT_FORMAT;

    if (!(m_downsample_composite_pass_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_downsample_composite_pass_pipeline, "Downsample Composite Pass Pipeline");

    GPUSampler::Config config = GPUSampler::GetLinearConfig();
    config.min_lod = 0;
    config.max_lod = GPUSampler::Config::LOD_MAX;
    if (!(m_downsample_lod_sampler = g_gpu_device->CreateSampler(config)))
      return false;

    GL_OBJECT_NAME(m_downsample_lod_sampler, "Downsample LOD Sampler");

    config.mip_filter = GPUSampler::Filter::Linear;
    if (!(m_downsample_composite_sampler = g_gpu_device->CreateSampler(config)))
      return false;

    GL_OBJECT_NAME(m_downsample_composite_sampler, "Downsample Trilinear Sampler");
  }
  else if (m_downsample_mode == GPUDownsampleMode::Box)
  {
    std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader(
      GPUShaderStage::Fragment,
      shadergen.GenerateBoxSampleDownsampleFragmentShader(m_resolution_scale / GetBoxDownsampleScale()));
    if (!fs)
      return false;

    GL_OBJECT_NAME(fs, "Downsample First Pass Fragment Shader");

    plconfig.fragment_shader = fs.get();

    if (!(m_downsample_first_pass_pipeline = g_gpu_device->CreatePipeline(plconfig)))
      return false;

    GL_OBJECT_NAME(m_downsample_first_pass_pipeline, "Downsample First Pass Pipeline");
  }

  progress.Increment();

#undef UPDATE_PROGRESS

  return true;
}

void GPU_HW::DestroyPipelines()
{
  static constexpr auto destroy = [](std::unique_ptr<GPUPipeline>& p) { p.reset(); };

  m_wireframe_pipeline.reset();

  m_batch_pipelines.enumerate(destroy);
  m_vram_fill_pipelines.enumerate(destroy);

  for (std::unique_ptr<GPUPipeline>& p : m_vram_write_pipelines)
    destroy(p);
  for (std::unique_ptr<GPUPipeline>& p : m_vram_copy_pipelines)
    destroy(p);

  destroy(m_vram_readback_pipeline);
  destroy(m_vram_update_depth_pipeline);

  destroy(m_downsample_first_pass_pipeline);
  destroy(m_downsample_mid_pass_pipeline);
  destroy(m_downsample_blur_pass_pipeline);
  destroy(m_downsample_composite_pass_pipeline);
  m_downsample_composite_sampler.reset();

  m_copy_pipeline.reset();

  m_display_pipelines.enumerate(destroy);
}

void GPU_HW::UpdateVRAMReadTexture()
{
  GL_SCOPE("UpdateVRAMReadTexture()");

  const auto scaled_rect = m_vram_dirty_rect * m_resolution_scale;
  if (m_vram_texture->IsMultisampled())
  {
    if (g_gpu_device->GetFeatures().partial_msaa_resolve)
    {
      g_gpu_device->ResolveTextureRegion(m_vram_read_texture.get(), scaled_rect.left, scaled_rect.top, 0, 0,
                                         m_vram_texture.get(), scaled_rect.left, scaled_rect.top,
                                         scaled_rect.GetWidth(), scaled_rect.GetHeight());
    }
    else
    {
      g_gpu_device->ResolveTextureRegion(m_vram_read_texture.get(), 0, 0, 0, 0, m_vram_texture.get(), 0, 0,
                                         m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
    }
  }
  else
  {
    g_gpu_device->CopyTextureRegion(m_vram_read_texture.get(), scaled_rect.left, scaled_rect.top, 0, 0,
                                    m_vram_texture.get(), scaled_rect.left,
                                    scaled_rect.top, 0, 0, scaled_rect.GetWidth(), scaled_rect.GetHeight());
  }

  m_renderer_stats.num_vram_read_texture_updates++;
  ClearVRAMDirtyRectangle();
}

void GPU_HW::UpdateDepthBufferFromMaskBit()
{
  if (m_pgxp_depth_buffer)
    return;

  // Viewport should already be set full, only need to fudge the scissor.
  g_gpu_device->SetScissor(0, 0, m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
  g_gpu_device->SetFramebuffer(m_vram_update_depth_framebuffer.get());
  g_gpu_device->SetPipeline(m_vram_update_depth_pipeline.get());
  g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
  g_gpu_device->Draw(3, 0);

  // Restore.
  g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler());
  g_gpu_device->SetFramebuffer(m_vram_framebuffer.get());
  SetScissor();
}

void GPU_HW::ClearDepthBuffer()
{
  DebugAssert(m_pgxp_depth_buffer);

  g_gpu_device->ClearDepth(m_vram_depth_texture.get(), 1.0f);
  m_last_depth_z = 1.0f;
}

void GPU_HW::SetScissor()
{
  const s32 left = m_drawing_area.left * m_resolution_scale;
  const s32 right = std::max<u32>((m_drawing_area.right + 1) * m_resolution_scale, left + 1);
  const s32 top = m_drawing_area.top * m_resolution_scale;
  const s32 bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, top + 1);

  g_gpu_device->SetScissor(left, top, right - left, bottom - top);
}

void GPU_HW::MapBatchVertexPointer(u32 required_vertices)
{
  DebugAssert(!m_batch_start_vertex_ptr);

  void* map;
  u32 space;
  g_gpu_device->MapVertexBuffer(sizeof(BatchVertex), required_vertices, &map, &space, &m_batch_base_vertex);

  m_batch_start_vertex_ptr = static_cast<BatchVertex*>(map);
  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
  m_batch_end_vertex_ptr = m_batch_start_vertex_ptr + space;
}

void GPU_HW::UnmapBatchVertexPointer(u32 used_vertices)
{
  DebugAssert(m_batch_start_vertex_ptr);
  g_gpu_device->UnmapVertexBuffer(sizeof(BatchVertex), used_vertices);
  m_batch_start_vertex_ptr = nullptr;
  m_batch_end_vertex_ptr = nullptr;
  m_batch_current_vertex_ptr = nullptr;
}

void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_vertices, u32 base_vertex)
{
  // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
  const u8 depth_test = m_batch.use_depth_buffer ? static_cast<u8>(2) : BoolToUInt8(m_batch.check_mask_before_draw);
  g_gpu_device->SetPipeline(
    m_batch_pipelines[depth_test][static_cast<u8>(render_mode)][static_cast<u8>(m_batch.texture_mode)][static_cast<u8>(
      m_batch.transparency_mode)][BoolToUInt8(m_batch.dithering)][BoolToUInt8(m_batch.interlacing)]
      .get());
  g_gpu_device->Draw(num_vertices, base_vertex);
}

void GPU_HW::ClearDisplay()
{
  ClearDisplayTexture();
  g_gpu_device->ClearRenderTarget(m_display_private_texture.get(), 0xFF000000u);
}

void GPU_HW::HandleFlippedQuadTextureCoordinates(BatchVertex* vertices)
{
  // Taken from beetle-psx gpu_polygon.cpp
  // For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior. If U or V is decreasing in X
  // or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation covers an entire pixel,
  // while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the
  // entire pixel. While we could emulate this reasonably well in native resolution by shifting our vertex coords by
  // 0.5, this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to
  // hit the same UV every time.
  // One approach here is to use interpolate at offset or similar tricks to generalize the
  // PSX interpolation patterns, but the problem is that vertices sharing an edge will no longer see the same UV (due
  // to different plane derivatives), we end up sampling outside the intended boundary and artifacts are inevitable, so
  // the only case where we can apply this fixup is for "sprites" or similar which should not share edges, which leads
  // to this unfortunate code below.
  //
  // It might be faster to do more direct checking here, but the code below handles primitives in any order and
  // orientation, and is far more SIMD-friendly if needed.
  const float abx = vertices[1].x - vertices[0].x;
  const float aby = vertices[1].y - vertices[0].y;
  const float bcx = vertices[2].x - vertices[1].x;
  const float bcy = vertices[2].y - vertices[1].y;
  const float cax = vertices[0].x - vertices[2].x;
  const float cay = vertices[0].y - vertices[2].y;

  // Compute static derivatives, just assume W is uniform across the primitive and that the plane equation remains the
  // same across the quad. (which it is, there is no Z.. yet).
  const float dudx = -aby * static_cast<float>(vertices[2].u) - bcy * static_cast<float>(vertices[0].u) -
                     cay * static_cast<float>(vertices[1].u);
  const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
                     cay * static_cast<float>(vertices[1].v);
  const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
                     cax * static_cast<float>(vertices[1].u);
  const float dvdy = +abx * static_cast<float>(vertices[2].v) + bcx * static_cast<float>(vertices[0].v) +
                     cax * static_cast<float>(vertices[1].v);
  const float area = bcx * cay - bcy * cax;

  // Detect and reject any triangles with 0 size texture area
  const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
                      (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);

  // Leverage PGXP to further avoid 3D polygons that just happen to align this way after projection
  const bool is_3d = (vertices[0].w != vertices[1].w || vertices[0].w != vertices[2].w);

  // Shouldn't matter as degenerate primitives will be culled anyways.
  if (area == 0.0f || texArea == 0 || is_3d)
    return;

  // Use floats here as it'll be faster than integer divides.
  const float rcp_area = 1.0f / area;
  const float dudx_area = dudx * rcp_area;
  const float dudy_area = dudy * rcp_area;
  const float dvdx_area = dvdx * rcp_area;
  const float dvdy_area = dvdy * rcp_area;
  const bool neg_dudx = dudx_area < 0.0f;
  const bool neg_dudy = dudy_area < 0.0f;
  const bool neg_dvdx = dvdx_area < 0.0f;
  const bool neg_dvdy = dvdy_area < 0.0f;
  const bool zero_dudx = dudx_area == 0.0f;
  const bool zero_dudy = dudy_area == 0.0f;
  const bool zero_dvdx = dvdx_area == 0.0f;
  const bool zero_dvdy = dvdy_area == 0.0f;

  // If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in
  // this impl. If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we
  // guarantee that we don't sample garbage at least. Overall, this is kinda hacky because there can be legitimate,
  // rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but this is way better than
  // having borked 2D overall.
  //
  // TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
  // create an 8-bit code, and use a LUT to get the offsets.
  //
  // Case 1: U is decreasing in X, but no change in Y.
  // Case 2: U is decreasing in Y, but no change in X.
  // Case 3: V is decreasing in X, but no change in Y.
  // Case 4: V is decreasing in Y, but no change in X.
  if ((neg_dudx && zero_dudy) || (neg_dudy && zero_dudx))
  {
    vertices[0].u++;
    vertices[1].u++;
    vertices[2].u++;
    vertices[3].u++;
  }

  if ((neg_dvdx && zero_dvdy) || (neg_dvdy && zero_dvdx))
  {
    vertices[0].v++;
    vertices[1].v++;
    vertices[2].v++;
    vertices[3].v++;
  }
}

void GPU_HW::ComputePolygonUVLimits(BatchVertex* vertices, u32 num_vertices)
{
  u16 min_u = vertices[0].u, max_u = vertices[0].u, min_v = vertices[0].v, max_v = vertices[0].v;
  for (u32 i = 1; i < num_vertices; i++)
  {
    min_u = std::min(min_u, vertices[i].u);
    max_u = std::max(max_u, vertices[i].u);
    min_v = std::min(min_v, vertices[i].v);
    max_v = std::max(max_v, vertices[i].v);
  }

  if (min_u != max_u)
    max_u--;
  if (min_v != max_v)
    max_v--;

  for (u32 i = 0; i < num_vertices; i++)
    vertices[i].SetUVLimits(min_u, max_u, min_v, max_v);
}

void GPU_HW::SetBatchDepthBuffer(bool enabled)
{
  if (m_batch.use_depth_buffer == enabled)
    return;

  if (GetBatchVertexCount() > 0)
  {
    FlushRender();
    EnsureVertexBufferSpaceForCurrentCommand();
  }

  m_batch.use_depth_buffer = enabled;
}

void GPU_HW::CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices)
{
  DebugAssert(num_vertices == 3 || num_vertices == 4);
  float average_z;
  if (num_vertices == 3)
    average_z = std::min((vertices[0].w + vertices[1].w + vertices[2].w) / 3.0f, 1.0f);
  else
    average_z = std::min((vertices[0].w + vertices[1].w + vertices[2].w + vertices[3].w) / 4.0f, 1.0f);

  if ((average_z - m_last_depth_z) >= g_settings.gpu_pgxp_depth_clear_threshold)
  {
    if (GetBatchVertexCount() > 0)
    {
      FlushRender();
      EnsureVertexBufferSpaceForCurrentCommand();
    }

    ClearDepthBuffer();
  }

  m_last_depth_z = average_z;
}

u32 GPU_HW::GetAdaptiveDownsamplingMipLevels() const
{
  u32 levels = 0;
  u32 current_width = VRAM_WIDTH * m_resolution_scale;
  while (current_width >= VRAM_WIDTH)
  {
    levels++;
    current_width /= 2;
  }

  return levels;
}

void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1, float depth)
{
  const float dx = x1 - x0;
  const float dy = y1 - y0;
  std::array<BatchVertex, 4> output;
  if (dx == 0.0f && dy == 0.0f)
  {
    // Degenerate, render a point.
    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
  }
  else
  {
    const float abs_dx = std::fabs(dx);
    const float abs_dy = std::fabs(dy);
    float fill_dx, fill_dy;
    float dxdk, dydk;
    float pad_x0 = 0.0f;
    float pad_x1 = 0.0f;
    float pad_y0 = 0.0f;
    float pad_y1 = 0.0f;

    // Check for vertical or horizontal major lines.
    // When expanding to a rect, do so in the appropriate direction.
    // FIXME: This scheme seems to kinda work, but it seems very hard to find a method
    // that looks perfect on every game.
    // Vagrant Story speech bubbles are a very good test case here!
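    // Worked example (illustrative numbers, not from the original source): a line from (0,0) to (10,3) is
    // horizontal-major (abs_dx > abs_dy), so it fills downwards by one pixel (fill_dx = 0, fill_dy = 1) and,
    // since dx > 0, pads the right endpoint by (1, dy / abs_dx) = (1, 0.3), giving the quad corners
    // (0,0), (0,1), (11,3.3), (11,4.3).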
    if (abs_dx > abs_dy)
    {
      fill_dx = 0.0f;
      fill_dy = 1.0f;
      dxdk = 1.0f;
      dydk = dy / abs_dx;

      if (dx > 0.0f)
      {
        // Right
        pad_x1 = 1.0f;
        pad_y1 = dydk;
      }
      else
      {
        // Left
        pad_x0 = 1.0f;
        pad_y0 = -dydk;
      }
    }
    else
    {
      fill_dx = 1.0f;
      fill_dy = 0.0f;
      dydk = 1.0f;
      dxdk = dx / abs_dy;

      if (dy > 0.0f)
      {
        // Down
        pad_y1 = 1.0f;
        pad_x1 = dxdk;
      }
      else
      {
        // Up
        pad_y0 = 1.0f;
        pad_x0 = -dxdk;
      }
    }

    const float ox0 = x0 + pad_x0;
    const float oy0 = y0 + pad_y0;
    const float ox1 = x1 + pad_x1;
    const float oy1 = y1 + pad_y1;

    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
  }

  AddVertex(output[0]);
  AddVertex(output[1]);
  AddVertex(output[2]);

  AddVertex(output[3]);
  AddVertex(output[2]);
  AddVertex(output[1]);
}

void GPU_HW::LoadVertices()
{
  if (m_GPUSTAT.check_mask_before_draw)
    m_current_depth++;

  const GPURenderCommand rc{m_render_command.bits};
  const u32 texpage = ZeroExtend32(m_draw_mode.mode_reg.bits) | (ZeroExtend32(m_draw_mode.palette_reg) << 16);
  const float depth = GetCurrentNormalizedVertexDepth();

  switch (rc.primitive)
  {
    case GPUPrimitive::Polygon:
    {
      DebugAssert(GetBatchVertexSpace() >= (rc.quad_polygon ? 6u : 3u));

      const u32 first_color = rc.color_for_first_vertex;
      const bool shaded = rc.shading_enable;
      const bool textured = rc.texture_enable;
      const bool pgxp = g_settings.gpu_pgxp_enable;

      const u32 num_vertices = rc.quad_polygon ? 4 : 3;
      std::array<BatchVertex, 4> vertices;
      std::array<std::array<s32, 2>, 4> native_vertex_positions;
      std::array<u16, 4> native_texcoords;
      bool valid_w = g_settings.gpu_pgxp_texture_correction;
      for (u32 i = 0; i < num_vertices; i++)
      {
        const u32 color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
        const u64 maddr_and_pos = m_fifo.Pop();
        const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
        const u16 texcoord = textured ? Truncate16(FifoPop()) : 0;
        const s32 native_x = m_drawing_offset.x + vp.x;
        const s32 native_y = m_drawing_offset.y + vp.y;
        native_vertex_positions[i][0] = native_x;
        native_vertex_positions[i][1] = native_y;
        native_texcoords[i] = texcoord;
        vertices[i].Set(static_cast<float>(native_x), static_cast<float>(native_y), depth, 1.0f, color, texpage,
                        texcoord, 0xFFFF0000u);

        if (pgxp)
        {
          valid_w &= PGXP::GetPreciseVertex(Truncate32(maddr_and_pos >> 32), vp.bits, native_x, native_y,
                                            m_drawing_offset.x, m_drawing_offset.y, &vertices[i].x, &vertices[i].y,
                                            &vertices[i].w);
        }
      }
      if (pgxp)
      {
        if (!valid_w)
        {
          SetBatchDepthBuffer(false);
          for (BatchVertex& v : vertices)
            v.w = 1.0f;
        }
        else if (m_pgxp_depth_buffer)
        {
          const bool use_depth = (m_batch.transparency_mode == GPUTransparencyMode::Disabled);
          SetBatchDepthBuffer(use_depth);
          if (use_depth)
            CheckForDepthClear(vertices.data(), num_vertices);
        }
      }

      if (rc.quad_polygon && m_resolution_scale > 1)
        HandleFlippedQuadTextureCoordinates(vertices.data());

      if (m_using_uv_limits && textured)
        ComputePolygonUVLimits(vertices.data(), num_vertices);

      if (!IsDrawingAreaIsValid())
        return;

      // Cull polygons which are too large.
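      // (Illustrative note: MAX_PRIMITIVE_WIDTH/MAX_PRIMITIVE_HEIGHT are defined elsewhere; assuming they mirror the
      // PSX rasterizer's 1024x512 rejection limits, a primitive this large would not have been drawn by the real GPU
      // either, so skipping it loses nothing.)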
      const auto [min_x_12, max_x_12] = MinMax(native_vertex_positions[1][0], native_vertex_positions[2][0]);
      const auto [min_y_12, max_y_12] = MinMax(native_vertex_positions[1][1], native_vertex_positions[2][1]);
      const s32 min_x = std::min(min_x_12, native_vertex_positions[0][0]);
      const s32 max_x = std::max(max_x_12, native_vertex_positions[0][0]);
      const s32 min_y = std::min(min_y_12, native_vertex_positions[0][1]);
      const s32 max_y = std::max(max_y_12, native_vertex_positions[0][1]);

      if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
      {
        Log_DebugPrintf("Culling too-large polygon: %d,%d %d,%d %d,%d", native_vertex_positions[0][0],
                        native_vertex_positions[0][1], native_vertex_positions[1][0], native_vertex_positions[1][1],
                        native_vertex_positions[2][0], native_vertex_positions[2][1]);
      }
      else
      {
        const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
        const u32 clip_right = static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
        const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y, m_drawing_area.top, m_drawing_area.bottom));
        const u32 clip_bottom =
          static_cast<u32>(std::clamp<s32>(max_y, m_drawing_area.top, m_drawing_area.bottom)) + 1u;

        m_vram_dirty_rect.Include(clip_left, clip_right, clip_top, clip_bottom);
        AddDrawTriangleTicks(native_vertex_positions[0][0], native_vertex_positions[0][1],
                             native_vertex_positions[1][0], native_vertex_positions[1][1],
                             native_vertex_positions[2][0], native_vertex_positions[2][1], rc.shading_enable,
                             rc.texture_enable, rc.transparency_enable);

        std::memcpy(m_batch_current_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
        m_batch_current_vertex_ptr += 3;
      }

      // quads
      if (rc.quad_polygon)
      {
        const s32 min_x_123 = std::min(min_x_12, native_vertex_positions[3][0]);
        const s32 max_x_123 = std::max(max_x_12, native_vertex_positions[3][0]);
        const s32 min_y_123 = std::min(min_y_12, native_vertex_positions[3][1]);
        const s32 max_y_123 = std::max(max_y_12, native_vertex_positions[3][1]);

        // Cull polygons which are too large.
        if ((max_x_123 - min_x_123) >= MAX_PRIMITIVE_WIDTH || (max_y_123 - min_y_123) >= MAX_PRIMITIVE_HEIGHT)
        {
          Log_DebugPrintf("Culling too-large polygon (quad second half): %d,%d %d,%d %d,%d",
                          native_vertex_positions[2][0], native_vertex_positions[2][1], native_vertex_positions[1][0],
                          native_vertex_positions[1][1], native_vertex_positions[0][0], native_vertex_positions[0][1]);
        }
        else
        {
          const u32 clip_left =
            static_cast<u32>(std::clamp<s32>(min_x_123, m_drawing_area.left, m_drawing_area.right));
          const u32 clip_right =
            static_cast<u32>(std::clamp<s32>(max_x_123, m_drawing_area.left, m_drawing_area.right)) + 1u;
          const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y_123, m_drawing_area.top, m_drawing_area.bottom));
          const u32 clip_bottom =
            static_cast<u32>(std::clamp<s32>(max_y_123, m_drawing_area.top, m_drawing_area.bottom)) + 1u;

          m_vram_dirty_rect.Include(clip_left, clip_right, clip_top, clip_bottom);
          AddDrawTriangleTicks(native_vertex_positions[2][0], native_vertex_positions[2][1],
                               native_vertex_positions[1][0], native_vertex_positions[1][1],
                               native_vertex_positions[3][0], native_vertex_positions[3][1], rc.shading_enable,
                               rc.texture_enable, rc.transparency_enable);

          AddVertex(vertices[2]);
          AddVertex(vertices[1]);
          AddVertex(vertices[3]);
        }
      }

      if (m_sw_renderer)
      {
        GPUBackendDrawPolygonCommand* cmd = m_sw_renderer->NewDrawPolygonCommand(num_vertices);
        FillDrawCommand(cmd, rc);

        for (u32 i = 0; i < num_vertices; i++)
        {
          GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
          vert->x = native_vertex_positions[i][0];
          vert->y = native_vertex_positions[i][1];
          vert->texcoord = native_texcoords[i];
          vert->color = vertices[i].color;
        }

        m_sw_renderer->PushCommand(cmd);
      }
    }
    break;

    case GPUPrimitive::Rectangle:
    {
      const u32 color = rc.color_for_first_vertex;
      const GPUVertexPosition vp{FifoPop()};
      const s32 pos_x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
      const s32 pos_y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);

      const auto [texcoord_x, texcoord_y] = UnpackTexcoord(rc.texture_enable ? Truncate16(FifoPop()) : 0);
      u16 orig_tex_left = ZeroExtend16(texcoord_x);
      u16 orig_tex_top = ZeroExtend16(texcoord_y);
      s32 rectangle_width;
      s32 rectangle_height;
      switch (rc.rectangle_size)
      {
        case GPUDrawRectangleSize::R1x1:
          rectangle_width = 1;
          rectangle_height = 1;
          break;
        case GPUDrawRectangleSize::R8x8:
          rectangle_width = 8;
          rectangle_height = 8;
          break;
        case GPUDrawRectangleSize::R16x16:
          rectangle_width = 16;
          rectangle_height = 16;
          break;
        default:
        {
          const u32 width_and_height = FifoPop();
          rectangle_width = static_cast<s32>(width_and_height & VRAM_WIDTH_MASK);
          rectangle_height = static_cast<s32>((width_and_height >> 16) & VRAM_HEIGHT_MASK);

          if (rectangle_width >= MAX_PRIMITIVE_WIDTH || rectangle_height >= MAX_PRIMITIVE_HEIGHT)
          {
            Log_DebugPrintf("Culling too-large rectangle: %d,%d %dx%d", pos_x, pos_y, rectangle_width,
                            rectangle_height);
            return;
          }
        }
        break;
      }

      if (!IsDrawingAreaIsValid())
        return;

      // we can split the rectangle up into potentially 8 quads
      SetBatchDepthBuffer(false);
      DebugAssert(GetBatchVertexSpace() >= MAX_VERTICES_FOR_RECTANGLE);

      // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat.
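      // Worked example (illustrative numbers, not from the original source): a 300-wide rectangle whose texcoords
      // start at tex_left = 200 first emits a 56-wide quad (256 - 200 texels remain in the page), then tex_left
      // wraps to 0 and a second 244-wide quad covers the remainder; rows wrap the same way via tex_top.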
      u16 tex_top = orig_tex_top;
      for (s32 y_offset = 0; y_offset < rectangle_height;)
      {
        const s32 quad_height = std::min<s32>(rectangle_height - y_offset, TEXTURE_PAGE_HEIGHT - tex_top);
        const float quad_start_y = static_cast<float>(pos_y + y_offset);
        const float quad_end_y = quad_start_y + static_cast<float>(quad_height);
        const u16 tex_bottom = tex_top + static_cast<u16>(quad_height);

        u16 tex_left = orig_tex_left;
        for (s32 x_offset = 0; x_offset < rectangle_width;)
        {
          const s32 quad_width = std::min<s32>(rectangle_width - x_offset, TEXTURE_PAGE_WIDTH - tex_left);
          const float quad_start_x = static_cast<float>(pos_x + x_offset);
          const float quad_end_x = quad_start_x + static_cast<float>(quad_width);
          const u16 tex_right = tex_left + static_cast<u16>(quad_width);
          const u32 uv_limits = BatchVertex::PackUVLimits(tex_left, tex_right - 1, tex_top, tex_bottom - 1);

          AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top, uv_limits);
          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);

          AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
          AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
          AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom, uv_limits);

          x_offset += quad_width;
          tex_left = 0;
        }

        y_offset += quad_height;
        tex_top = 0;
      }

      const u32 clip_left = static_cast<u32>(std::clamp<s32>(pos_x, m_drawing_area.left, m_drawing_area.right));
      const u32 clip_right =
        static_cast<u32>(std::clamp<s32>(pos_x + rectangle_width, m_drawing_area.left, m_drawing_area.right)) + 1u;
      const u32 clip_top = static_cast<u32>(std::clamp<s32>(pos_y, m_drawing_area.top, m_drawing_area.bottom));
      const u32 clip_bottom =
        static_cast<u32>(std::clamp<s32>(pos_y + rectangle_height, m_drawing_area.top, m_drawing_area.bottom)) + 1u;

      m_vram_dirty_rect.Include(clip_left, clip_right, clip_top, clip_bottom);
      AddDrawRectangleTicks(clip_right - clip_left, clip_bottom - clip_top, rc.texture_enable, rc.transparency_enable);

      if (m_sw_renderer)
      {
        GPUBackendDrawRectangleCommand* cmd = m_sw_renderer->NewDrawRectangleCommand();
        FillDrawCommand(cmd, rc);
        cmd->color = color;
        cmd->x = pos_x;
        cmd->y = pos_y;
        cmd->width = static_cast<u16>(rectangle_width);
        cmd->height = static_cast<u16>(rectangle_height);
        cmd->texcoord = (static_cast<u16>(texcoord_y) << 8) | static_cast<u16>(texcoord_x);
        m_sw_renderer->PushCommand(cmd);
      }
    }
    break;

    case GPUPrimitive::Line:
    {
      SetBatchDepthBuffer(false);

      if (!rc.polyline)
      {
        DebugAssert(GetBatchVertexSpace() >= 2);

        u32 start_color, end_color;
        GPUVertexPosition start_pos, end_pos;
        if (rc.shading_enable)
        {
          start_color = rc.color_for_first_vertex;
          start_pos.bits = FifoPop();
          end_color = FifoPop() & UINT32_C(0x00FFFFFF);
          end_pos.bits = FifoPop();
        }
        else
        {
          start_color = end_color = rc.color_for_first_vertex;
          start_pos.bits = FifoPop();
          end_pos.bits = FifoPop();
        }

        if (!IsDrawingAreaIsValid())
          return;

        s32 start_x = start_pos.x + m_drawing_offset.x;
        s32 start_y = start_pos.y + m_drawing_offset.y;
        s32 end_x = end_pos.x + m_drawing_offset.x;
        s32 end_y = end_pos.y + m_drawing_offset.y;

        const auto [min_x, max_x] = MinMax(start_x, end_x);
        const auto [min_y, max_y] = MinMax(start_y, end_y);
        if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
        {
          Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", start_x, start_y, end_x, end_y);
          return;
        }

        const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
        const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
        const u32 clip_right =
          static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
        const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y, m_drawing_area.top, m_drawing_area.bottom));
        const u32 clip_bottom =
          static_cast<u32>(std::clamp<s32>(max_y, m_drawing_area.top, m_drawing_area.bottom)) + 1u;

        m_vram_dirty_rect.Include(clip_left, clip_right, clip_top, clip_bottom);
        AddDrawLineTicks(clip_right - clip_left, clip_bottom - clip_top, rc.shading_enable);

        // TODO: Should we do a PGXP lookup here? Most lines are 2D.
        DrawLine(static_cast<float>(start_x), static_cast<float>(start_y), start_color, static_cast<float>(end_x),
                 static_cast<float>(end_y), end_color, depth);

        if (m_sw_renderer)
        {
          GPUBackendDrawLineCommand* cmd = m_sw_renderer->NewDrawLineCommand(2);
          FillDrawCommand(cmd, rc);
          cmd->vertices[0].Set(start_x, start_y, start_color);
          cmd->vertices[1].Set(end_x, end_y, end_color);
          m_sw_renderer->PushCommand(cmd);
        }
      }
      else
      {
        // Multiply by two because we don't use line strips.
        const u32 num_vertices = GetPolyLineVertexCount();
        DebugAssert(GetBatchVertexSpace() >= (num_vertices * 2));

        if (!IsDrawingAreaIsValid())
          return;

        const bool shaded = rc.shading_enable;

        u32 buffer_pos = 0;
        const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
        s32 start_x = start_vp.x + m_drawing_offset.x;
        s32 start_y = start_vp.y + m_drawing_offset.y;
        u32 start_color = rc.color_for_first_vertex;

        GPUBackendDrawLineCommand* cmd;
        if (m_sw_renderer)
        {
          cmd = m_sw_renderer->NewDrawLineCommand(num_vertices);
          FillDrawCommand(cmd, rc);
          cmd->vertices[0].Set(start_x, start_y, start_color);
        }
        else
        {
          cmd = nullptr;
        }

        for (u32 i = 1; i < num_vertices; i++)
        {
          const u32 end_color = shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : start_color;
          const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
          const s32 end_x = m_drawing_offset.x + vp.x;
          const s32 end_y = m_drawing_offset.y + vp.y;

          const auto [min_x, max_x] = MinMax(start_x, end_x);
          const auto [min_y, max_y] = MinMax(start_y, end_y);
          if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
          {
            Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", start_x, start_y, end_x, end_y);
          }
          else
          {
            const u32 clip_left =
              static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
            const u32 clip_right =
              static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
            const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y, m_drawing_area.top, m_drawing_area.bottom));
            const u32 clip_bottom =
              static_cast<u32>(std::clamp<s32>(max_y, m_drawing_area.top, m_drawing_area.bottom)) + 1u;

            m_vram_dirty_rect.Include(clip_left, clip_right, clip_top, clip_bottom);
            AddDrawLineTicks(clip_right - clip_left, clip_bottom - clip_top, rc.shading_enable);

            // TODO: Should we do a PGXP lookup here? Most lines are 2D.
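            // DrawLine() expands each segment into a small quad of batch vertices rather than using native
            // line primitives, which is why EnsureVertexBufferSpaceForCurrentCommand() reserves six vertices
            // per polyline segment.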
            DrawLine(static_cast<float>(start_x), static_cast<float>(start_y), start_color,
                     static_cast<float>(end_x), static_cast<float>(end_y), end_color, depth);
          }

          start_x = end_x;
          start_y = end_y;
          start_color = end_color;

          if (cmd)
            cmd->vertices[i].Set(end_x, end_y, end_color);
        }

        if (cmd)
          m_sw_renderer->PushCommand(cmd);
      }
    }
    break;

    default:
      UnreachableCode();
      break;
  }
}

bool GPU_HW::BlitVRAMReplacementTexture(const TextureReplacementTexture* tex, u32 dst_x, u32 dst_y, u32 width,
                                        u32 height)
{
  if (!m_vram_replacement_texture || m_vram_replacement_texture->GetWidth() < tex->GetWidth() ||
      m_vram_replacement_texture->GetHeight() < tex->GetHeight())
  {
    m_vram_replacement_texture.reset();

    if (!(m_vram_replacement_texture =
            g_gpu_device->CreateTexture(tex->GetWidth(), tex->GetHeight(), 1, 1, 1, GPUTexture::Type::Texture,
                                        GPUTexture::Format::RGBA8, tex->GetPixels(), tex->GetPitch(), true)))
    {
      return false;
    }
  }
  else
  {
    if (!m_vram_replacement_texture->Update(0, 0, width, height, tex->GetPixels(), tex->GetPitch()))
    {
      Log_ErrorPrintf("Update %ux%u texture failed.", width, height);
      return false;
    }
  }

  g_gpu_device->SetFramebuffer(m_vram_framebuffer.get()); // TODO: needed?
  g_gpu_device->SetTextureSampler(0, m_vram_replacement_texture.get(), g_gpu_device->GetLinearSampler());
  g_gpu_device->SetPipeline(m_copy_pipeline.get());
  g_gpu_device->SetViewportAndScissor(dst_x, dst_y, width, height);
  g_gpu_device->Draw(3, 0);

  RestoreDeviceContext();
  return true;
}

void GPU_HW::IncludeVRAMDirtyRectangle(const Common::Rectangle<u32>& rect)
{
  m_vram_dirty_rect.Include(rect);

  // The dirtied area can overlap the current texture page or palette even when the game never re-issues a
  // texpage change. Flag the page as changed in that case, so the shadow (read) texture is updated before
  // the next textured draw.
  if (!m_draw_mode.IsTexturePageChanged() &&
      (m_draw_mode.mode_reg.GetTexturePageRectangle().Intersects(rect) ||
       (m_draw_mode.mode_reg.IsUsingPalette() && m_draw_mode.GetTexturePaletteRectangle().Intersects(rect))))
  {
    m_draw_mode.SetTexturePageChanged();
  }
}

GPU_HW::InterlacedRenderMode GPU_HW::GetInterlacedRenderMode() const
{
  if (IsInterlacedDisplayEnabled())
  {
    return m_GPUSTAT.vertical_resolution ? InterlacedRenderMode::InterleavedFields :
                                           InterlacedRenderMode::SeparateFields;
  }
  else
  {
    return InterlacedRenderMode::None;
  }
}

void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices)
{
  if (m_batch_current_vertex_ptr)
  {
    if (GetBatchVertexSpace() >= required_vertices)
      return;

    FlushRender();
  }

  MapBatchVertexPointer(required_vertices);
}

void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand()
{
  u32 required_vertices;
  switch (m_render_command.primitive)
  {
    case GPUPrimitive::Polygon:
      required_vertices = m_render_command.quad_polygon ? 6 : 3;
      break;
    case GPUPrimitive::Rectangle:
      required_vertices = MAX_VERTICES_FOR_RECTANGLE;
      break;
    case GPUPrimitive::Line:
    default:
      required_vertices = m_render_command.polyline ? (GetPolyLineVertexCount() * 6u) : 6u;
      break;
  }

  // can we fit these vertices in the current depth buffer range?
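  // Each draw consumes one depth ID per vertex from a fixed pool (MAX_BATCH_VERTEX_COUNTER_IDS), which the
  // mask-bit emulation maps to monotonically increasing normalized depth values. Once the pool would be
  // exhausted, ResetBatchVertexDepth() folds the accumulated mask state back into the depth buffer and
  // restarts the counter at 1.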
  if ((m_current_depth + required_vertices) > MAX_BATCH_VERTEX_COUNTER_IDS)
  {
    // implies FlushRender()
    ResetBatchVertexDepth();
  }
  else if (m_batch_current_vertex_ptr)
  {
    if (GetBatchVertexSpace() >= required_vertices)
      return;

    FlushRender();
  }

  MapBatchVertexPointer(required_vertices);
}

void GPU_HW::ResetBatchVertexDepth()
{
  if (m_pgxp_depth_buffer)
    return;

  Log_PerfPrint("Resetting batch vertex depth");
  FlushRender();
  UpdateDepthBufferFromMaskBit();

  m_current_depth = 1;
}

void GPU_HW::UpdateSoftwareRenderer(bool copy_vram_from_hw)
{
  const bool current_enabled = (m_sw_renderer != nullptr);
  const bool new_enabled = g_settings.gpu_use_software_renderer_for_readbacks;
  if (current_enabled == new_enabled)
    return;

  m_vram_ptr = m_vram_shadow.data();

  if (!new_enabled)
  {
    if (m_sw_renderer)
      m_sw_renderer->Shutdown();
    m_sw_renderer.reset();
    return;
  }

  std::unique_ptr<GPU_SW_Backend> sw_renderer = std::make_unique<GPU_SW_Backend>();
  if (!sw_renderer->Initialize(true))
    return;

  // We need to fill in the SW renderer's VRAM with the current state for hot toggles.
  if (copy_vram_from_hw)
  {
    FlushRender();
    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
    std::memcpy(sw_renderer->GetVRAM(), m_vram_ptr, sizeof(u16) * VRAM_WIDTH * VRAM_HEIGHT);

    // Sync the drawing area.
    GPUBackendSetDrawingAreaCommand* cmd = sw_renderer->NewSetDrawingAreaCommand();
    cmd->new_area = m_drawing_area;
    sw_renderer->PushCommand(cmd);
  }

  m_sw_renderer = std::move(sw_renderer);
  m_vram_ptr = m_sw_renderer->GetVRAM();
}

void GPU_HW::FillBackendCommandParameters(GPUBackendCommand* cmd) const
{
  cmd->params.bits = 0;
  cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
  cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
  cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
  cmd->params.interlaced_rendering = m_GPUSTAT.SkipDrawingToActiveField();
}

void GPU_HW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) const
{
  FillBackendCommandParameters(cmd);
  cmd->rc.bits = rc.bits;
  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
  cmd->palette.bits = m_draw_mode.palette_reg;
  cmd->window = m_draw_mode.texture_window;
}

void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
{
  if (m_sw_renderer)
  {
    GPUBackendFillVRAMCommand* cmd = m_sw_renderer->NewFillVRAMCommand();
    FillBackendCommandParameters(cmd);
    cmd->x = static_cast<u16>(x);
    cmd->y = static_cast<u16>(y);
    cmd->width = static_cast<u16>(width);
    cmd->height = static_cast<u16>(height);
    cmd->color = color;
    m_sw_renderer->PushCommand(cmd);
  }

  IncludeVRAMDirtyRectangle(
    Common::Rectangle<u32>::FromExtents(x, y, width, height).Clamped(0, 0, VRAM_WIDTH, VRAM_HEIGHT));

  const bool is_oversized = (((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT));
  g_gpu_device->SetPipeline(
    m_vram_fill_pipelines[BoolToUInt8(is_oversized)][BoolToUInt8(IsInterlacedRenderingEnabled())].get());

  const Common::Rectangle<u32> bounds(GetVRAMTransferBounds(x, y, width, height));
  g_gpu_device->SetViewportAndScissor(bounds.left * m_resolution_scale, bounds.top * m_resolution_scale,
                                      bounds.GetWidth() * m_resolution_scale,
                                      bounds.GetHeight() * m_resolution_scale);

  struct VRAMFillUBOData
  {
    u32 u_dst_x;
    u32 u_dst_y;
    u32 u_end_x;
    u32 u_end_y;
    std::array<float, 4> u_fill_color;
    u32 u_interlaced_displayed_field;
  };

  VRAMFillUBOData uniforms;
  uniforms.u_dst_x = (x % VRAM_WIDTH) * m_resolution_scale;
  uniforms.u_dst_y = (y % VRAM_HEIGHT) * m_resolution_scale;
  uniforms.u_end_x = ((x + width) % VRAM_WIDTH) * m_resolution_scale;
  uniforms.u_end_y = ((y + height) % VRAM_HEIGHT) * m_resolution_scale;

  // drop precision unless true colour is enabled
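  // Without true colour, the fill colour is quantized to the native 5-bit-per-channel format first.
  // Illustrative example: an 8-bit channel of 0xCC maps to the 5-bit value 0x19 and converts back to
  // roughly 0xCE, matching what the real hardware would store in VRAM (exact rounding depends on the
  // conversion helpers).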
  uniforms.u_fill_color =
    GPUDevice::RGBA8ToFloat(m_true_color ? color : VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color)));
  uniforms.u_interlaced_displayed_field = GetActiveLineLSB();
  g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
  g_gpu_device->Draw(3, 0);

  RestoreDeviceContext();
}

void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
{
  if (m_sw_renderer)
  {
    m_sw_renderer->Sync(false);
    return;
  }

  // Get bounds with wrap-around handled.
  const Common::Rectangle<u32> copy_rect = GetVRAMTransferBounds(x, y, width, height);
  const u32 encoded_width = (copy_rect.GetWidth() + 1) / 2;
  const u32 encoded_height = copy_rect.GetHeight();

  // Encode the 16-bit VRAM into the 32-bit readback texture; each output texel packs two VRAM pixels,
  // hence half the width.
  const u32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight()};
  g_gpu_device->SetFramebuffer(m_vram_readback_framebuffer.get());
  g_gpu_device->SetPipeline(m_vram_readback_pipeline.get());
  g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
  g_gpu_device->SetViewportAndScissor(0, 0, encoded_width, encoded_height);
  g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));
  g_gpu_device->Draw(3, 0);
  m_vram_readback_texture->MakeReadyForSampling();

  // Stage the readback and copy it into our shadow buffer.
  g_gpu_device->DownloadTexture(m_vram_readback_texture.get(), 0, 0, encoded_width, encoded_height,
                                reinterpret_cast<u32*>(&m_vram_shadow[copy_rect.top * VRAM_WIDTH + copy_rect.left]),
                                VRAM_WIDTH * sizeof(u16));

  RestoreDeviceContext();
}

void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
{
  if (m_sw_renderer)
  {
    const u32 num_words = width * height;
    GPUBackendUpdateVRAMCommand* cmd = m_sw_renderer->NewUpdateVRAMCommand(num_words);
    FillBackendCommandParameters(cmd);
    cmd->params.set_mask_while_drawing = set_mask;
    cmd->params.check_mask_before_draw = check_mask;
    cmd->x = static_cast<u16>(x);
    cmd->y = static_cast<u16>(y);
    cmd->width = static_cast<u16>(width);
    cmd->height = static_cast<u16>(height);
    std::memcpy(cmd->data, data, sizeof(u16) * num_words);
    m_sw_renderer->PushCommand(cmd);
  }

  const Common::Rectangle<u32> bounds = GetVRAMTransferBounds(x, y, width, height);
  DebugAssert(bounds.right <= VRAM_WIDTH && bounds.bottom <= VRAM_HEIGHT);
  IncludeVRAMDirtyRectangle(bounds);

  if (check_mask)
  {
    // set new vertex counter since we want this to take into consideration previous masked pixels
    m_current_depth++;
  }
  else
  {
    const TextureReplacementTexture* rtex = g_texture_replacements.GetVRAMWriteReplacement(width, height, data);
    if (rtex && BlitVRAMReplacementTexture(rtex, x * m_resolution_scale, y * m_resolution_scale,
                                           width * m_resolution_scale, height * m_resolution_scale))
    {
      return;
    }
  }

  const u32 num_pixels = width * height;
  void* map = m_vram_upload_buffer->Map(num_pixels);
  const u32 map_index = m_vram_upload_buffer->GetCurrentPosition();
  std::memcpy(map, data, num_pixels * sizeof(u16));
  m_vram_upload_buffer->Unmap(num_pixels);

  struct VRAMWriteUBOData
  {
    u32 u_dst_x;
    u32 u_dst_y;
    u32 u_end_x;
    u32 u_end_y;
    u32 u_width;
    u32 u_height;
    u32 u_buffer_base_offset;
    u32 u_mask_or_bits;
    float u_depth_value;
  };
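  // Bit 15 of each VRAM halfword is the mask bit. When set_mask is enabled, u_mask_or_bits (0x8000) is
  // OR'ed into every written pixel; when check_mask is enabled, the depth test (keyed off u_depth_value)
  // rejects writes to pixels whose mask bit was set by an earlier draw.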
  const VRAMWriteUBOData uniforms = {(x % VRAM_WIDTH),
                                     (y % VRAM_HEIGHT),
                                     ((x + width) % VRAM_WIDTH),
                                     ((y + height) % VRAM_HEIGHT),
                                     width,
                                     height,
                                     map_index,
                                     (set_mask) ? 0x8000u : 0x00,
                                     GetCurrentNormalizedVertexDepth()};

  // the viewport should already be set to the full vram, so just adjust the scissor
  const Common::Rectangle<u32> scaled_bounds = bounds * m_resolution_scale;
  g_gpu_device->SetScissor(scaled_bounds.left, scaled_bounds.top, scaled_bounds.GetWidth(),
                           scaled_bounds.GetHeight());
  g_gpu_device->SetPipeline(m_vram_write_pipelines[BoolToUInt8(check_mask && !m_pgxp_depth_buffer)].get());
  g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
  g_gpu_device->SetTextureBuffer(0, m_vram_upload_buffer.get());
  g_gpu_device->Draw(3, 0);

  RestoreDeviceContext();
}

void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
{
  if (m_sw_renderer)
  {
    GPUBackendCopyVRAMCommand* cmd = m_sw_renderer->NewCopyVRAMCommand();
    FillBackendCommandParameters(cmd);
    cmd->src_x = static_cast<u16>(src_x);
    cmd->src_y = static_cast<u16>(src_y);
    cmd->dst_x = static_cast<u16>(dst_x);
    cmd->dst_y = static_cast<u16>(dst_y);
    cmd->width = static_cast<u16>(width);
    cmd->height = static_cast<u16>(height);
    m_sw_renderer->PushCommand(cmd);
  }

  // masking enabled, oversized, or overlapping
  const bool use_shader =
    (m_GPUSTAT.IsMaskingEnabled() || ((src_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
     ((src_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT || ((dst_x % VRAM_WIDTH) + width) > VRAM_WIDTH ||
     ((dst_y % VRAM_HEIGHT) + height) > VRAM_HEIGHT ||
     Common::Rectangle<u32>::FromExtents(src_x, src_y, width, height)
       .Intersects(Common::Rectangle<u32>::FromExtents(dst_x, dst_y, width, height)));

  if (use_shader || IsUsingMultisampling())
  {
    const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
    const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
    if (m_vram_dirty_rect.Intersects(src_bounds))
      UpdateVRAMReadTexture();
    IncludeVRAMDirtyRectangle(dst_bounds);

    struct VRAMCopyUBOData
    {
      u32 u_src_x;
      u32 u_src_y;
      u32 u_dst_x;
      u32 u_dst_y;
      u32 u_end_x;
      u32 u_end_y;
      u32 u_width;
      u32 u_height;
      u32 u_set_mask_bit;
      float u_depth_value;
    };

    const VRAMCopyUBOData uniforms = {(src_x % VRAM_WIDTH) * m_resolution_scale,
                                      (src_y % VRAM_HEIGHT) * m_resolution_scale,
                                      (dst_x % VRAM_WIDTH) * m_resolution_scale,
                                      (dst_y % VRAM_HEIGHT) * m_resolution_scale,
                                      ((dst_x + width) % VRAM_WIDTH) * m_resolution_scale,
                                      ((dst_y + height) % VRAM_HEIGHT) * m_resolution_scale,
                                      width * m_resolution_scale,
                                      height * m_resolution_scale,
                                      m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
                                      GetCurrentNormalizedVertexDepth()};

    // VRAM read texture should already be bound.
    const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
    g_gpu_device->SetViewportAndScissor(dst_bounds_scaled.left, dst_bounds_scaled.top, dst_bounds_scaled.GetWidth(),
                                        dst_bounds_scaled.GetHeight());
    g_gpu_device->SetPipeline(
      m_vram_copy_pipelines[BoolToUInt8(m_GPUSTAT.check_mask_before_draw && !m_pgxp_depth_buffer)].get());
    g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
    g_gpu_device->Draw(3, 0);
    RestoreDeviceContext();

    if (m_GPUSTAT.check_mask_before_draw && !m_pgxp_depth_buffer)
      m_current_depth++;

    return;
  }

  // We can't CopySubresourceRegion to the same resource. So use the shadow texture if we can, but that may need to be
  // updated first. Copying to the same resource seemed to work on Windows 10, but breaks on Windows 7. But, it's
  // against the API spec, so better to be safe than sorry.
  // TODO: make this an optional feature, DX12 can do it
  if (m_vram_dirty_rect.Intersects(Common::Rectangle<u32>::FromExtents(src_x, src_y, width, height)))
    UpdateVRAMReadTexture();
  IncludeVRAMDirtyRectangle(
    Common::Rectangle<u32>::FromExtents(dst_x, dst_y, width, height).Clamped(0, 0, VRAM_WIDTH, VRAM_HEIGHT));

  if (m_GPUSTAT.check_mask_before_draw)
  {
    // set new vertex counter since we want this to take into consideration previous masked pixels
    m_current_depth++;
  }

  g_gpu_device->CopyTextureRegion(m_vram_texture.get(), dst_x * m_resolution_scale, dst_y * m_resolution_scale, 0, 0,
                                  m_vram_read_texture.get(), src_x * m_resolution_scale, src_y * m_resolution_scale,
                                  0, 0, width * m_resolution_scale, height * m_resolution_scale);
  m_vram_read_texture->MakeReadyForSampling();
}

void GPU_HW::DispatchRenderCommand()
{
  const GPURenderCommand rc{m_render_command.bits};

  GPUTextureMode texture_mode;
  if (rc.IsTexturingEnabled())
  {
    // texture page changed - check that the new page doesn't intersect the drawing area
    if (m_draw_mode.IsTexturePageChanged())
    {
      m_draw_mode.ClearTexturePageChangedFlag();
      if (m_vram_dirty_rect.Valid() &&
          (m_draw_mode.mode_reg.GetTexturePageRectangle().Intersects(m_vram_dirty_rect) ||
           (m_draw_mode.mode_reg.IsUsingPalette() &&
            m_draw_mode.GetTexturePaletteRectangle().Intersects(m_vram_dirty_rect))))
      {
        // Log_DevPrintf("Invalidating VRAM read cache due to drawing area overlap");
        if (!IsFlushed())
          FlushRender();

        UpdateVRAMReadTexture();
      }
    }

    texture_mode = m_draw_mode.mode_reg.texture_mode;
    if (rc.raw_texture_enable)
    {
      texture_mode =
        static_cast<GPUTextureMode>(static_cast<u8>(texture_mode) | static_cast<u8>(GPUTextureMode::RawTextureBit));
    }
  }
  else
  {
    texture_mode = GPUTextureMode::Disabled;
  }

  // has any state changed which requires a new batch?
  const GPUTransparencyMode transparency_mode =
    rc.transparency_enable ? m_draw_mode.mode_reg.transparency_mode : GPUTransparencyMode::Disabled;
  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
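  // Note: draws using BackgroundMinusForeground always flush the current batch (see the condition below);
  // reverse-subtract blending is handled as a special case in the draw path and is not safely coalescable
  // with preceding draws the way the other transparency modes are.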
  if (texture_mode != m_batch.texture_mode || transparency_mode != m_batch.transparency_mode ||
      transparency_mode == GPUTransparencyMode::BackgroundMinusForeground || dithering_enable != m_batch.dithering)
  {
    FlushRender();
  }

  EnsureVertexBufferSpaceForCurrentCommand();

  // transparency mode change
  if (m_batch.transparency_mode != transparency_mode && transparency_mode != GPUTransparencyMode::Disabled)
  {
    // source/destination alpha factors for the four PSX blend equations:
    // B/2 + F/2, B + F, B - F, and B + F/4.
    static constexpr float transparent_alpha[4][2] = {{0.5f, 0.5f}, {1.0f, 1.0f}, {1.0f, 1.0f}, {0.25f, 1.0f}};

    const float src_alpha_factor = transparent_alpha[static_cast<u32>(transparency_mode)][0];
    const float dst_alpha_factor = transparent_alpha[static_cast<u32>(transparency_mode)][1];
    m_batch_ubo_dirty |= (m_batch_ubo_data.u_src_alpha_factor != src_alpha_factor ||
                          m_batch_ubo_data.u_dst_alpha_factor != dst_alpha_factor);
    m_batch_ubo_data.u_src_alpha_factor = src_alpha_factor;
    m_batch_ubo_data.u_dst_alpha_factor = dst_alpha_factor;
  }

  const bool check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
  const bool set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
  if (m_batch.check_mask_before_draw != check_mask_before_draw ||
      m_batch.set_mask_while_drawing != set_mask_while_drawing)
  {
    m_batch.check_mask_before_draw = check_mask_before_draw;
    m_batch.set_mask_while_drawing = set_mask_while_drawing;
    m_batch_ubo_dirty |= (m_batch_ubo_data.u_set_mask_while_drawing != BoolToUInt32(set_mask_while_drawing));
    m_batch_ubo_data.u_set_mask_while_drawing = BoolToUInt32(set_mask_while_drawing);
  }

  m_batch.interlacing = IsInterlacedRenderingEnabled();
  if (m_batch.interlacing)
  {
    const u32 displayed_field = GetActiveLineLSB();
    m_batch_ubo_dirty |= (m_batch_ubo_data.u_interlaced_displayed_field != displayed_field);
    m_batch_ubo_data.u_interlaced_displayed_field = displayed_field;
  }

  // update state
  m_batch.texture_mode = texture_mode;
  m_batch.transparency_mode = transparency_mode;
  m_batch.dithering = dithering_enable;

  if (m_draw_mode.IsTextureWindowChanged())
  {
    m_draw_mode.ClearTextureWindowChangedFlag();

    m_batch_ubo_data.u_texture_window_and[0] = ZeroExtend32(m_draw_mode.texture_window.and_x);
    m_batch_ubo_data.u_texture_window_and[1] = ZeroExtend32(m_draw_mode.texture_window.and_y);
    m_batch_ubo_data.u_texture_window_or[0] = ZeroExtend32(m_draw_mode.texture_window.or_x);
    m_batch_ubo_data.u_texture_window_or[1] = ZeroExtend32(m_draw_mode.texture_window.or_y);
    m_batch_ubo_dirty = true;
  }

  if (m_drawing_area_changed)
  {
    m_drawing_area_changed = false;
    SetScissor();

    if (m_pgxp_depth_buffer && m_last_depth_z < 1.0f)
      ClearDepthBuffer();

    if (m_sw_renderer)
    {
      GPUBackendSetDrawingAreaCommand* cmd = m_sw_renderer->NewSetDrawingAreaCommand();
      cmd->new_area = m_drawing_area;
      m_sw_renderer->PushCommand(cmd);
    }
  }

  LoadVertices();
}

void GPU_HW::FlushRender()
{
  if (!m_batch_current_vertex_ptr)
    return;

  const u32 vertex_count = GetBatchVertexCount();
  UnmapBatchVertexPointer(vertex_count);
  if (vertex_count == 0)
    return;

#ifdef _DEBUG
  GL_SCOPE("Hardware Draw %u", ++s_draw_number);
#endif

  if (m_batch_ubo_dirty)
  {
    g_gpu_device->UploadUniformBuffer(&m_batch_ubo_data, sizeof(m_batch_ubo_data));
    m_renderer_stats.num_uniform_buffer_updates++;
    m_batch_ubo_dirty = false;
  }
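  // When a batch mixes opaque and transparent pixels (e.g. textured draws where only some texels carry the
  // semi-transparency flag), it is drawn twice below: an opaque-only pass that fills colour and depth,
  // followed by a transparent-only pass that blends on top. Otherwise a single pass with the batch's
  // render mode suffices.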
  if (m_wireframe_mode != GPUWireframeMode::OnlyWireframe)
  {
    if (NeedsTwoPassRendering())
    {
      m_renderer_stats.num_batches += 2;
      DrawBatchVertices(BatchRenderMode::OnlyOpaque, vertex_count, m_batch_base_vertex);
      DrawBatchVertices(BatchRenderMode::OnlyTransparent, vertex_count, m_batch_base_vertex);
    }
    else
    {
      m_renderer_stats.num_batches++;
      DrawBatchVertices(m_batch.GetRenderMode(), vertex_count, m_batch_base_vertex);
    }
  }

  if (m_wireframe_mode != GPUWireframeMode::Disabled)
  {
    m_renderer_stats.num_batches++;
    g_gpu_device->SetPipeline(m_wireframe_pipeline.get());
    g_gpu_device->Draw(vertex_count, m_batch_base_vertex);
  }
}

void GPU_HW::UpdateDisplay()
{
  FlushRender();

  if (g_settings.debugging.show_vram)
  {
    if (IsUsingMultisampling())
    {
      UpdateVRAMReadTexture();
      SetDisplayTexture(m_vram_read_texture.get(), 0, 0, m_vram_read_texture->GetWidth(),
                        m_vram_read_texture->GetHeight());
    }
    else
    {
      SetDisplayTexture(m_vram_texture.get(), 0, 0, m_vram_texture->GetWidth(), m_vram_texture->GetHeight());
    }

    SetDisplayParameters(VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
                         static_cast<float>(VRAM_WIDTH) / static_cast<float>(VRAM_HEIGHT));
  }
  else
  {
    // TODO: use a dynamically sized texture
    SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height, m_crtc_state.display_origin_left,
                         m_crtc_state.display_origin_top, m_crtc_state.display_vram_width,
                         m_crtc_state.display_vram_height, ComputeDisplayAspectRatio());

    const u32 resolution_scale = m_GPUSTAT.display_area_color_depth_24 ? 1 : m_resolution_scale;
    const u32 vram_offset_x = m_crtc_state.display_vram_left;
    const u32 vram_offset_y = m_crtc_state.display_vram_top;
    const u32 scaled_vram_offset_x = vram_offset_x * resolution_scale;
    const u32 scaled_vram_offset_y = vram_offset_y * resolution_scale;
    const u32 display_width = m_crtc_state.display_vram_width;
    const u32 display_height = m_crtc_state.display_vram_height;
    const u32 scaled_display_width = display_width * resolution_scale;
    const u32 scaled_display_height = display_height * resolution_scale;
    const InterlacedRenderMode interlaced = GetInterlacedRenderMode();

    if (IsDisplayDisabled())
    {
      ClearDisplayTexture();
    }
    else if (!m_GPUSTAT.display_area_color_depth_24 && interlaced == InterlacedRenderMode::None &&
             !IsUsingMultisampling() && (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture->GetWidth() &&
             (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture->GetHeight())
    {
      if (IsUsingDownsampling())
      {
        DownsampleFramebuffer(m_vram_texture.get(), scaled_vram_offset_x, scaled_vram_offset_y, scaled_display_width,
                              scaled_display_height);
      }
      else
      {
        SetDisplayTexture(m_vram_texture.get(), scaled_vram_offset_x, scaled_vram_offset_y, scaled_display_width,
                          scaled_display_height);
      }
    }
    else
    {
      // TODO: discard vs load for interlaced
      if (interlaced == InterlacedRenderMode::None)
        g_gpu_device->InvalidateRenderTarget(m_display_private_texture.get());

      g_gpu_device->SetFramebuffer(m_display_framebuffer.get());
      g_gpu_device->SetPipeline(
        m_display_pipelines[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][static_cast<u8>(interlaced)].get());
      g_gpu_device->SetTextureSampler(0, m_vram_texture.get(), g_gpu_device->GetNearestSampler());
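      // Slow path: the display region has to be re-rendered, either to unpack 24bpp framebuffers (stored
      // as packed bytes in 16-bit VRAM, which is why resolution_scale is forced to 1 above) or to extract
      // the interlaced field selected by reinterpret_field_offset below.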
      const u32 reinterpret_field_offset =
        (interlaced != InterlacedRenderMode::None) ? GetInterlacedDisplayField() : 0;
      const u32 reinterpret_start_x = m_crtc_state.regs.X * resolution_scale;
      const u32 reinterpret_crop_left = (m_crtc_state.display_vram_left - m_crtc_state.regs.X) * resolution_scale;
      const u32 uniforms[4] = {reinterpret_start_x, scaled_vram_offset_y + reinterpret_field_offset,
                               reinterpret_crop_left, reinterpret_field_offset};
      g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms));

      Assert(scaled_display_width <= m_display_private_texture->GetWidth() &&
             scaled_display_height <= m_display_private_texture->GetHeight());

      g_gpu_device->SetViewportAndScissor(0, 0, scaled_display_width, scaled_display_height);
      g_gpu_device->Draw(3, 0);

      if (IsUsingDownsampling())
        DownsampleFramebuffer(m_display_private_texture.get(), 0, 0, scaled_display_width, scaled_display_height);
      else
        SetDisplayTexture(m_display_private_texture.get(), 0, 0, scaled_display_width, scaled_display_height);

      RestoreDeviceContext();
    }
  }
}

void GPU_HW::DownsampleFramebuffer(GPUTexture* source, u32 left, u32 top, u32 width, u32 height)
{
  if (m_downsample_mode == GPUDownsampleMode::Adaptive)
    DownsampleFramebufferAdaptive(source, left, top, width, height);
  else
    DownsampleFramebufferBoxFilter(source, left, top, width, height);
}

void GPU_HW::DownsampleFramebufferAdaptive(GPUTexture* source, u32 left, u32 top, u32 width, u32 height)
{
  GL_PUSH("DownsampleFramebufferAdaptive (%u,%u => %u,%u)", left, top, left + width, top + height);

  struct SmoothingUBOData
  {
    float min_uv[2];
    float max_uv[2];
    float rcp_size[2];
    float lod;
  };

  g_gpu_device->CopyTextureRegion(m_downsample_texture.get(), 0, 0, 0, 0, source, left, top, 0, 0, width, height);
  g_gpu_device->SetTextureSampler(0, m_downsample_texture.get(), m_downsample_lod_sampler.get());

  const u32 levels = m_downsample_texture->GetLevels();
  SmoothingUBOData uniforms;

  // create mip chain
  for (u32 level = 1; level < levels; level++)
  {
    GL_SCOPE("Create miplevel %u", level);

    const u32 level_width = width >> level;
    const u32 level_height = height >> level;
    const float rcp_width = 1.0f / static_cast<float>(m_downsample_texture->GetMipWidth(level));
    const float rcp_height = 1.0f / static_cast<float>(m_downsample_texture->GetMipHeight(level));
    uniforms.min_uv[0] = 0.0f;
    uniforms.min_uv[1] = 0.0f;
    uniforms.max_uv[0] = static_cast<float>(level_width) * rcp_width;
    uniforms.max_uv[1] = static_cast<float>(level_height) * rcp_height;
    uniforms.rcp_size[0] = rcp_width;
    uniforms.rcp_size[1] = rcp_height;
    uniforms.lod = static_cast<float>(level - 1);

    g_gpu_device->InvalidateRenderTarget(m_downsample_render_texture.get());
    g_gpu_device->SetFramebuffer(m_downsample_framebuffer.get());
    g_gpu_device->SetViewportAndScissor(0, 0, level_width, level_height);
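    // Each iteration renders level N from level N-1 (uniforms.lod selects the source mip), so the chain
    // progressively halves the image; the first pass uses a dedicated pipeline below since it samples the
    // freshly copied base level rather than a previously downsampled one.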
    g_gpu_device->SetPipeline((level == 1) ? m_downsample_first_pass_pipeline.get() :
                                             m_downsample_mid_pass_pipeline.get());
    g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
    g_gpu_device->Draw(3, 0);

    g_gpu_device->CopyTextureRegion(m_downsample_texture.get(), 0, 0, 0, level, m_downsample_render_texture.get(), 0,
                                    0, 0, 0, level_width, level_height);
  }

  // blur pass at lowest level
  {
    GL_SCOPE("Blur");

    const u32 last_level = levels - 1;
    const u32 last_width = width >> last_level;
    const u32 last_height = height >> last_level;
    const float rcp_width = 1.0f / static_cast<float>(m_downsample_render_texture->GetWidth());
    const float rcp_height = 1.0f / static_cast<float>(m_downsample_render_texture->GetHeight());
    uniforms.min_uv[0] = 0.0f;
    uniforms.min_uv[1] = 0.0f;
    uniforms.max_uv[0] = static_cast<float>(last_width) * rcp_width;
    uniforms.max_uv[1] = static_cast<float>(last_height) * rcp_height;
    uniforms.rcp_size[0] = rcp_width;
    uniforms.rcp_size[1] = rcp_height;
    uniforms.lod = 0.0f;

    m_downsample_render_texture->MakeReadyForSampling();
    g_gpu_device->InvalidateRenderTarget(m_downsample_weight_texture.get());
    g_gpu_device->SetFramebuffer(m_downsample_weight_framebuffer.get());
    g_gpu_device->SetTextureSampler(0, m_downsample_render_texture.get(), g_gpu_device->GetNearestSampler());
    g_gpu_device->SetViewportAndScissor(0, 0, last_width, last_height);
    g_gpu_device->SetPipeline(m_downsample_blur_pass_pipeline.get());
    g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms));
    g_gpu_device->Draw(3, 0);
    m_downsample_weight_texture->MakeReadyForSampling();
  }

  // composite downsampled and upsampled images together
  {
    GL_SCOPE("Composite");

    g_gpu_device->InvalidateRenderTarget(m_downsample_render_texture.get());
    g_gpu_device->SetFramebuffer(m_downsample_framebuffer.get());
    g_gpu_device->SetTextureSampler(0, m_downsample_texture.get(), m_downsample_composite_sampler.get());
    g_gpu_device->SetTextureSampler(1, m_downsample_weight_texture.get(), m_downsample_lod_sampler.get());
    g_gpu_device->SetViewportAndScissor(0, 0, width, height);
    g_gpu_device->SetPipeline(m_downsample_composite_pass_pipeline.get());
    g_gpu_device->Draw(3, 0);
    m_downsample_render_texture->MakeReadyForSampling();
  }

  GL_POP();

  RestoreDeviceContext();

  SetDisplayTexture(m_downsample_render_texture.get(), 0, 0, width, height);
}

void GPU_HW::DownsampleFramebufferBoxFilter(GPUTexture* source, u32 left, u32 top, u32 width, u32 height)
{
  const u32 factor = m_resolution_scale / GetBoxDownsampleScale();
  const u32 ds_left = left / factor;
  const u32 ds_top = top / factor;
  const u32 ds_width = width / factor;
  const u32 ds_height = height / factor;
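  // Illustrative example (assumed settings, not defaults): with an 8x internal resolution and a box
  // downsample scale of 2, factor is 4, so a 2560x2240 scaled display region is averaged down to 640x560
  // in the single shader pass below.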
  source->MakeReadyForSampling();

  g_gpu_device->ClearRenderTarget(m_downsample_render_texture.get(), 0);
  g_gpu_device->SetFramebuffer(m_downsample_framebuffer.get());
  g_gpu_device->SetPipeline(m_downsample_first_pass_pipeline.get());
  g_gpu_device->SetTextureSampler(0, source, g_gpu_device->GetNearestSampler());
  g_gpu_device->SetViewportAndScissor(ds_left, ds_top, ds_width, ds_height);
  g_gpu_device->Draw(3, 0);

  RestoreDeviceContext();

  SetDisplayTexture(m_downsample_render_texture.get(), ds_left, ds_top, ds_width, ds_height);
}

void GPU_HW::DrawRendererStats(bool is_idle_frame)
{
  if (!is_idle_frame)
  {
    m_last_renderer_stats = m_renderer_stats;
    m_renderer_stats = {};
  }

  if (ImGui::CollapsingHeader("Renderer Statistics", ImGuiTreeNodeFlags_DefaultOpen))
  {
    static const ImVec4 active_color{1.0f, 1.0f, 1.0f, 1.0f};
    static const ImVec4 inactive_color{0.4f, 0.4f, 0.4f, 1.0f};
    const auto& stats = m_last_renderer_stats;

    ImGui::Columns(2);
    ImGui::SetColumnWidth(0, 200.0f * Host::GetOSDScale());

    ImGui::TextUnformatted("Resolution Scale:");
    ImGui::NextColumn();
    ImGui::Text("%u (VRAM %ux%u)", m_resolution_scale, VRAM_WIDTH * m_resolution_scale,
                VRAM_HEIGHT * m_resolution_scale);
    ImGui::NextColumn();

    ImGui::TextUnformatted("Effective Display Resolution:");
    ImGui::NextColumn();
    ImGui::Text("%ux%u", m_crtc_state.display_vram_width * m_resolution_scale,
                m_crtc_state.display_vram_height * m_resolution_scale);
    ImGui::NextColumn();

    ImGui::TextUnformatted("True Color:");
    ImGui::NextColumn();
    ImGui::TextColored(m_true_color ? active_color : inactive_color, m_true_color ? "Enabled" : "Disabled");
    ImGui::NextColumn();

    ImGui::TextUnformatted("Scaled Dithering:");
    ImGui::NextColumn();
    ImGui::TextColored(m_scaled_dithering ? active_color : inactive_color,
                       m_scaled_dithering ? "Enabled" : "Disabled");
    ImGui::NextColumn();

    ImGui::TextUnformatted("Texture Filtering:");
    ImGui::NextColumn();
    ImGui::TextColored((m_texture_filtering != GPUTextureFilter::Nearest) ? active_color : inactive_color, "%s",
                       Settings::GetTextureFilterDisplayName(m_texture_filtering));
    ImGui::NextColumn();

    ImGui::TextUnformatted("PGXP:");
    ImGui::NextColumn();
    ImGui::TextColored(g_settings.gpu_pgxp_enable ? active_color : inactive_color, "Geom");
    ImGui::SameLine();
    ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_culling) ? active_color : inactive_color,
                       "Cull");
    ImGui::SameLine();
    ImGui::TextColored(
      (g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_texture_correction) ? active_color : inactive_color, "Tex");
    ImGui::SameLine();
    ImGui::TextColored((g_settings.gpu_pgxp_enable && g_settings.gpu_pgxp_vertex_cache) ? active_color :
                                                                                          inactive_color,
                       "Cache");
    ImGui::NextColumn();

    ImGui::TextUnformatted("Batches Drawn:");
    ImGui::NextColumn();
    ImGui::Text("%u", stats.num_batches);
    ImGui::NextColumn();

    ImGui::TextUnformatted("VRAM Read Texture Updates:");
    ImGui::NextColumn();
    ImGui::Text("%u", stats.num_vram_read_texture_updates);
    ImGui::NextColumn();

    ImGui::TextUnformatted("Uniform Buffer Updates:");
    ImGui::NextColumn();
    ImGui::Text("%u", stats.num_uniform_buffer_updates);
    ImGui::NextColumn();

    ImGui::Columns(1);
  }
}

std::unique_ptr<GPU> GPU::CreateHardwareRenderer()
{
  std::unique_ptr<GPU_HW> gpu(std::make_unique<GPU_HW>());
  if (!gpu->Initialize())
    return nullptr;

  return gpu;
}