diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 643bdf577..d00a8eb76 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -849,9 +849,8 @@ bool GPU_HW::CompilePipelines() const u32 active_texture_modes = m_allow_sprite_mode ? NUM_TEXTURE_MODES : (NUM_TEXTURE_MODES - (NUM_TEXTURE_MODES - static_cast(BatchTextureMode::SpriteStart))); - const u32 active_vertex_shaders = m_allow_sprite_mode ? 3 : 2; const u32 total_pipelines = - active_vertex_shaders + // vertex shaders + (m_allow_sprite_mode ? 5 : 3) + // vertex shaders (active_texture_modes * 5 * 9 * 2 * 2 * 2) + // fragment shaders ((m_pgxp_depth_buffer ? 2 : 1) * 5 * 5 * active_texture_modes * 2 * 2 * 2) + // batch pipelines ((m_wireframe_mode != GPUWireframeMode::Disabled) ? 1 : 0) + // wireframe @@ -867,29 +866,34 @@ bool GPU_HW::CompilePipelines() ShaderCompileProgressTracker progress("Compiling Pipelines", total_pipelines); - // vertex shaders - [non-textured/textured/sprite] + // vertex shaders - [textured/palette/sprite] // fragment shaders - [render_mode][transparency_mode][texture_mode][check_mask][dithering][interlacing] static constexpr auto destroy_shader = [](std::unique_ptr& s) { s.reset(); }; - DimensionalArray, 3> batch_vertex_shaders{}; + DimensionalArray, 2, 2, 2> batch_vertex_shaders{}; DimensionalArray, 2, 2, 2, NUM_TEXTURE_MODES, 5, 5> batch_fragment_shaders{}; ScopedGuard batch_shader_guard([&batch_vertex_shaders, &batch_fragment_shaders]() { batch_vertex_shaders.enumerate(destroy_shader); batch_fragment_shaders.enumerate(destroy_shader); }); - for (u8 textured = 0; textured < active_vertex_shaders; textured++) + for (u8 textured = 0; textured < 2; textured++) { - const bool sprite = (textured > 1); - const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering); - const std::string vs = shadergen.GenerateBatchVertexShader(textured != 0, uv_limits, - !sprite && force_round_texcoords, m_pgxp_depth_buffer); - if (!(batch_vertex_shaders[textured] = - g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs))) + for (u8 palette = 0; palette < (textured ? 2 : 1); palette++) { - return false; - } + for (u8 sprite = 0; sprite < (textured ? 2 : 1); sprite++) + { + const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering); + const std::string vs = shadergen.GenerateBatchVertexShader( + textured != 0, palette != 0, uv_limits, !sprite && force_round_texcoords, m_pgxp_depth_buffer); + if (!(batch_vertex_shaders[textured][palette][sprite] = + g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), vs))) + { + return false; + } - progress.Increment(); + progress.Increment(); + } + } } for (u8 render_mode = 0; render_mode < 5; render_mode++) @@ -1010,6 +1014,11 @@ bool GPU_HW::CompilePipelines() for (u8 check_mask = 0; check_mask < 2; check_mask++) { const bool textured = (static_cast(texture_mode) != BatchTextureMode::Disabled); + const bool palette = + (static_cast(texture_mode) == BatchTextureMode::Palette4Bit || + static_cast(texture_mode) == BatchTextureMode::Palette8Bit || + static_cast(texture_mode) == BatchTextureMode::SpritePalette4Bit || + static_cast(texture_mode) == BatchTextureMode::SpritePalette8Bit); const bool sprite = (static_cast(texture_mode) >= BatchTextureMode::SpriteStart); const bool uv_limits = ShouldClampUVs(sprite ? m_sprite_texture_filtering : m_texture_filtering); const bool use_shader_blending = @@ -1026,7 +1035,8 @@ bool GPU_HW::CompilePipelines() NUM_BATCH_TEXTURED_VERTEX_ATTRIBUTES)) : std::span(vertex_attributes, NUM_BATCH_VERTEX_ATTRIBUTES); - plconfig.vertex_shader = batch_vertex_shaders[BoolToUInt8(textured) + BoolToUInt8(sprite)].get(); + plconfig.vertex_shader = + batch_vertex_shaders[BoolToUInt8(textured)][BoolToUInt8(palette)][BoolToUInt8(sprite)].get(); plconfig.fragment_shader = batch_fragment_shaders[render_mode] [use_shader_blending ? transparency_mode : @@ -1132,7 +1142,7 @@ bool GPU_HW::CompilePipelines() GPUPipeline::BlendState::GetNoBlendingState(); plconfig.blend.write_mask = 0x7; plconfig.depth = GPUPipeline::DepthState::GetNoTestsState(); - plconfig.vertex_shader = batch_vertex_shaders[0].get(); + plconfig.vertex_shader = batch_vertex_shaders[0][0][0].get(); plconfig.geometry_shader = gs.get(); plconfig.fragment_shader = fs.get(); diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index a01945ba5..d67b3d1b2 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -57,12 +57,13 @@ void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) false); } -std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool uv_limits, bool force_round_texcoords, - bool pgxp_depth) +std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits, + bool force_round_texcoords, bool pgxp_depth) { std::stringstream ss; WriteHeader(ss); DefineMacro(ss, "TEXTURED", textured); + DefineMacro(ss, "PALETTE", palette); DefineMacro(ss, "UV_LIMITS", uv_limits); DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords); DefineMacro(ss, "PGXP_DEPTH", pgxp_depth); @@ -76,14 +77,14 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool uv_l { DeclareVertexEntryPoint( ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1, - {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, false, "", UsingMSAA(), - UsingPerSampleShading(), m_disable_color_perspective); + {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, + false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective); } else { DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1, - {{"nointerpolation", "uint4 v_texpage"}}, false, "", UsingMSAA(), UsingPerSampleShading(), - m_disable_color_perspective); + {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, false, "", + UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective); } } else @@ -126,22 +127,32 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool uv_l v_col0 = a_col0; #if TEXTURED - v_tex0 = float2(float((a_texcoord & 0xFFFFu) * RESOLUTION_SCALE), - float((a_texcoord >> 16) * RESOLUTION_SCALE)); + v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16)); + #if !PALETTE + v_tex0 *= float(RESOLUTION_SCALE); + #endif // base_x,base_y,palette_x,palette_y - // Palette X is scaled in fragment shader, since it can wrap. - v_texpage.x = (a_texpage & 15u) * 64u * RESOLUTION_SCALE; - v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE; - v_texpage.z = ((a_texpage >> 16) & 63u) * 16u; - v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE; + v_texpage.x = (a_texpage & 15u) * 64u; + v_texpage.y = ((a_texpage >> 4) & 1u) * 256u; + #if PALETTE + v_texpage.z = ((a_texpage >> 16) & 63u) * 16u; + v_texpage.w = ((a_texpage >> 22) & 511u); + #endif #if UV_LIMITS - v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0); - #if FORCE_ROUND_TEXCOORDS + v_uv_limits = a_uv_limits * 255.0; + + #if FORCE_ROUND_TEXCOORDS && PALETTE // Add 0.5 to the upper bounds when upscaling, to work around interpolation differences. // Limited to force-round-texcoord hack, to avoid breaking other games. v_uv_limits.zw += 0.5; + #elif !PALETTE + // Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled" + // pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled. + // (e.g. Mega Man Legends 2 haze effect) + v_uv_limits *= float(RESOLUTION_SCALE); + v_uv_limits.zw += float(RESOLUTION_SCALE - 1u); #endif #endif #endif @@ -158,7 +169,7 @@ void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTexture { DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha); ss << R"( -void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, +void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { // Compute the coordinates of the four texels we will be interpolating between. @@ -246,7 +257,7 @@ float4 resampler(float4 x) return res; } -void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, +void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { float4 weights[4]; @@ -392,7 +403,7 @@ float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scal #define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw)) -void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, +void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { //--------------------------------------- @@ -647,6 +658,8 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend); const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled); + const bool palette = + (texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit); const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend && (transparency != GPUTransparencyMode::Disabled || check_mask)); const bool use_dual_source = (!shader_blending && m_supports_dual_source_blend && @@ -663,9 +676,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod DefineMacro(ss, "SHADER_BLENDING", shader_blending); DefineMacro(ss, "CHECK_MASK_BIT", check_mask); DefineMacro(ss, "TEXTURED", textured); - DefineMacro(ss, "PALETTE", - texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || - texture_mode == GPU_HW::BatchTextureMode::Palette8Bit); + DefineMacro(ss, "PALETTE", palette); DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit); DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit); DefineMacro(ss, "DITHERING", dithering); @@ -679,6 +690,7 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source); DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth); DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords); + DefineMacro(ss, "UPSCALED", m_resolution_scale > 1); WriteCommonFunctions(ss); WriteBatchUniformBuffer(ss); @@ -719,6 +731,12 @@ uint3 ApplyDithering(uint2 coord, uint3 icol) #if TEXTURED CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0); +#if PALETTE + #define TEXPAGE_VALUE uint4 +#else + #define TEXPAGE_VALUE uint2 +#endif + uint2 ApplyTextureWindow(uint2 coords) { uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x; @@ -726,13 +744,6 @@ uint2 ApplyTextureWindow(uint2 coords) return uint2(x, y); } -uint2 ApplyUpscaledTextureWindow(uint2 coords) -{ - uint2 native_coords = coords / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); - uint2 coords_offset = coords % uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); - return (ApplyTextureWindow(native_coords) * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) + coords_offset; -} - uint2 FloatToIntegerCoords(float2 coords) { // With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates. @@ -740,42 +751,56 @@ uint2 FloatToIntegerCoords(float2 coords) return uint2((RESOLUTION_SCALE == 1u || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords)); } -float4 SampleFromVRAM(uint4 texpage, float2 coords) +float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords) { #if PALETTE uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords)); - uint2 index_coord = icoord; + + uint2 vicoord; #if PALETTE_4_BIT - index_coord.x /= 4u; + // 4bit will never wrap, since it's in the last texpage row. + vicoord = uint2(texpage.x + (icoord.x / 4u), texpage.y + icoord.y); #elif PALETTE_8_BIT - index_coord.x /= 2u; + // 8bit can wrap in the X direction. + vicoord = uint2((texpage.x + (icoord.x / 2u)) & 0x3FFu, texpage.y + icoord.y); #endif - // fixup coords - uint2 vicoord = texpage.xy + (index_coord * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)); - // load colour/palette - float4 texel = LOAD_TEXTURE(samp0, int2(vicoord), 0); + float4 texel = LOAD_TEXTURE(samp0, int2(vicoord * RESOLUTION_SCALE), 0); uint vram_value = RGBA8ToRGBA5551(texel); // apply palette #if PALETTE_4_BIT uint subpixel = icoord.x & 3u; uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu; - uint2 palette_icoord = uint2((texpage.z + palette_index) * RESOLUTION_SCALE, texpage.w); + uint2 palette_icoord = uint2((texpage.z + palette_index), texpage.w); #elif PALETTE_8_BIT // can only wrap in X direction for 8-bit, 4-bit will fit in texpage size. uint subpixel = icoord.x & 1u; uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu; - uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu) * RESOLUTION_SCALE, texpage.w); + uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu), texpage.w); #endif - return LOAD_TEXTURE(samp0, int2(palette_icoord), 0); + return LOAD_TEXTURE(samp0, int2(palette_icoord * RESOLUTION_SCALE), 0); #else - // Direct texturing. Render-to-texture effects. Use upscaled coordinates. - uint2 icoord = ApplyUpscaledTextureWindow(FloatToIntegerCoords(coords)); - uint2 direct_icoord = texpage.xy + icoord; - return LOAD_TEXTURE(samp0, int2(direct_icoord), 0); + // Direct texturing - usually render-to-texture effects. + uint2 vicoord; + #if !UPSCALED + uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords)); + vicoord = (texpage.xy + icoord) & uint2(1023, 511); + #else + // Coordinates are already upscaled, we need to downscale them to apply the texture + // window, then re-upscale/offset. We can't round here, because it could result in + // going outside of the texture window. + float2 ncoords = coords / float(RESOLUTION_SCALE); + float2 nfpart = frac(ncoords); + uint2 nicoord = ApplyTextureWindow(uint2(floor(ncoords))); + uint2 nvicoord = (texpage.xy + nicoord) & uint2(1023, 511); + coords = (float2(nvicoord) + nfpart) * float(RESOLUTION_SCALE); + vicoord = uint2(floor(coords)); + #endif + + return LOAD_TEXTURE(samp0, int2(vicoord), 0); #endif } @@ -808,15 +833,16 @@ float3 ApplyDebanding(float2 frag_coord) if (uv_limits) { DeclareFragmentEntryPoint(ss, 1, 1, - {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, + {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}, + {"nointerpolation", "float4 v_uv_limits"}}, true, use_dual_source ? 2 : 1, use_dual_source, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective, shader_blending); } else { - DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, true, use_dual_source ? 2 : 1, - use_dual_source, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false, - m_disable_color_perspective, shader_blending); + DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true, + use_dual_source ? 2 : 1, use_dual_source, m_write_mask_as_depth, UsingMSAA(), + UsingPerSampleShading(), false, m_disable_color_perspective, shader_blending); } } else @@ -841,34 +867,16 @@ float3 ApplyDebanding(float2 frag_coord) #endif #if TEXTURED - - // We can't currently use upscaled coordinate for palettes because of how they're packed. - // Not that it would be any benefit anyway, render-to-texture effects don't use palettes. - float2 coords = v_tex0; - #if PALETTE - coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE); - #endif - - #if UV_LIMITS - float4 uv_limits = v_uv_limits; - #if !PALETTE - // Extend the UV range to all "upscaled" pixels. This means 1-pixel-high polygon-based - // framebuffer effects won't be downsampled. (e.g. Mega Man Legends 2 haze effect) - uv_limits *= float(RESOLUTION_SCALE); - uv_limits.zw += float(RESOLUTION_SCALE - 1u); - #endif - #endif - float4 texcol; #if TEXTURE_FILTERING - FilteredSampleFromVRAM(v_texpage, coords, uv_limits, texcol, ialpha); + FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha); if (ialpha < 0.5) discard; #else #if UV_LIMITS - texcol = SampleFromVRAM(v_texpage, clamp(coords, uv_limits.xy, uv_limits.zw)); + texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw)); #else - texcol = SampleFromVRAM(v_texpage, coords); + texcol = SampleFromVRAM(v_texpage, v_tex0); #endif if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR)) discard; diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index f87bf2bcf..f456392ac 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -13,7 +13,8 @@ public: bool supports_dual_source_blend, bool supports_framebuffer_fetch, bool debanding); ~GPU_HW_ShaderGen(); - std::string GenerateBatchVertexShader(bool textured, bool uv_limits, bool force_round_texcoords, bool pgxp_depth); + std::string GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits, bool force_round_texcoords, + bool pgxp_depth); std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPU_HW::BatchTextureMode texture_mode, GPUTextureFilter texture_filtering, bool uv_limits, bool force_round_texcoords, bool dithering, bool interlacing, diff --git a/src/core/shader_cache_version.h b/src/core/shader_cache_version.h index 8e9b643c0..5217c0dfb 100644 --- a/src/core/shader_cache_version.h +++ b/src/core/shader_cache_version.h @@ -4,4 +4,4 @@ #pragma once #include "common/types.h" -static constexpr u32 SHADER_CACHE_VERSION = 16; +static constexpr u32 SHADER_CACHE_VERSION = 17;