Duckstation/src/core/gpu_hw_shadergen.cpp
2024-09-01 23:10:55 +10:00

1733 lines
61 KiB
C++

// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: PolyForm-Strict-1.0.0
//
// NOTE: Some parts of this file have more permissive licenses. They are marked appropriately.
//
#include "gpu_hw_shadergen.h"
#include "common/assert.h"
GPU_HW_ShaderGen::GPU_HW_ShaderGen(RenderAPI render_api, u32 resolution_scale, u32 multisamples,
bool per_sample_shading, bool true_color, bool scaled_dithering,
bool write_mask_as_depth, bool disable_color_perspective,
bool supports_dual_source_blend, bool supports_framebuffer_fetch, bool debanding)
: ShaderGen(render_api, GetShaderLanguageForAPI(render_api), supports_dual_source_blend, supports_framebuffer_fetch),
m_resolution_scale(resolution_scale), m_multisamples(multisamples), m_per_sample_shading(per_sample_shading),
m_true_color(true_color), m_scaled_dithering(scaled_dithering), m_write_mask_as_depth(write_mask_as_depth),
m_disable_color_perspective(disable_color_perspective), m_debanding(debanding)
{
}
GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;
void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss)
{
DefineMacro(ss, "MULTISAMPLING", UsingMSAA());
ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n";
ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n";
ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n";
ss << R"(
uint RGBA8ToRGBA5551(float4 v)
{
uint r = uint(roundEven(v.r * 31.0));
uint g = uint(roundEven(v.g * 31.0));
uint b = uint(roundEven(v.b * 31.0));
uint a = (v.a != 0.0) ? 1u : 0u;
return (r) | (g << 5) | (b << 10) | (a << 15);
}
float4 RGBA5551ToRGBA8(uint v)
{
uint r = (v & 31u);
uint g = ((v >> 5) & 31u);
uint b = ((v >> 10) & 31u);
uint a = ((v >> 15) & 1u);
return float4(float(r) / 31.0, float(g) / 31.0, float(b) / 31.0, float(a));
}
)";
}
void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss)
{
DeclareUniformBuffer(ss,
{"uint2 u_texture_window_and", "uint2 u_texture_window_or", "float u_src_alpha_factor",
"float u_dst_alpha_factor", "uint u_interlaced_displayed_field",
"bool u_set_mask_while_drawing"},
false);
}
std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool palette, bool uv_limits,
bool force_round_texcoords, bool pgxp_depth)
{
std::stringstream ss;
WriteHeader(ss);
DefineMacro(ss, "TEXTURED", textured);
DefineMacro(ss, "PALETTE", palette);
DefineMacro(ss, "UV_LIMITS", uv_limits);
DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
DefineMacro(ss, "PGXP_DEPTH", pgxp_depth);
WriteCommonFunctions(ss);
WriteBatchUniformBuffer(ss);
if (textured)
{
if (uv_limits)
{
DeclareVertexEntryPoint(
ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1,
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
{"nointerpolation", "float4 v_uv_limits"}},
false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
}
else
{
DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1,
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, false, "",
UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective);
}
}
else
{
DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, "", UsingMSAA(),
UsingPerSampleShading(), m_disable_color_perspective);
}
ss << R"(
{
// Offset the vertex position by 0.5 to ensure correct interpolation of texture coordinates
// at 1x resolution scale. This doesn't work at >1x, we adjust the texture coordinates before
// uploading there instead.
float vertex_offset = (RESOLUTION_SCALE == 1u) ? 0.5 : 0.0;
// 0..+1023 -> -1..1
float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0;
float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0;
#if PGXP_DEPTH
// Ignore mask Z when using PGXP depth.
float pos_z = a_pos.w;
float pos_w = a_pos.w;
#else
float pos_z = a_pos.z;
float pos_w = a_pos.w;
#endif
#if API_OPENGL || API_OPENGL_ES
// 0..1 to -1..1 depth range.
pos_z = (pos_z * 2.0) - 1.0;
#endif
// NDC space Y flip in Vulkan.
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
pos_y = -pos_y;
#endif
v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w);
v_col0 = a_col0;
#if TEXTURED
v_tex0 = float2(uint2(a_texcoord & 0xFFFFu, a_texcoord >> 16));
#if !PALETTE
v_tex0 *= float(RESOLUTION_SCALE);
#endif
// base_x,base_y,palette_x,palette_y
v_texpage.x = (a_texpage & 15u) * 64u;
v_texpage.y = ((a_texpage >> 4) & 1u) * 256u;
#if PALETTE
v_texpage.z = ((a_texpage >> 16) & 63u) * 16u;
v_texpage.w = ((a_texpage >> 22) & 511u);
#endif
#if UV_LIMITS
v_uv_limits = a_uv_limits * 255.0;
#if FORCE_ROUND_TEXCOORDS && PALETTE
// Add 0.5 to the upper bounds when upscaling, to work around interpolation differences.
// Limited to force-round-texcoord hack, to avoid breaking other games.
v_uv_limits.zw += 0.5;
#elif !PALETTE
// Treat coordinates as being in upscaled space, and extend the UV range to all "upscaled"
// pixels. This means 1-pixel-high polygon-based framebuffer effects won't be downsampled.
// (e.g. Mega Man Legends 2 haze effect)
v_uv_limits *= float(RESOLUTION_SCALE);
v_uv_limits.zw += float(RESOLUTION_SCALE - 1u);
#endif
#endif
#endif
}
)";
return ss.str();
}
void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTextureFilter texture_filter)
{
// JINC2 and xBRZ shaders originally from beetle-psx, modified to support filtering mask channel.
if (texture_filter == GPUTextureFilter::Bilinear || texture_filter == GPUTextureFilter::BilinearBinAlpha)
{
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha);
ss << R"(
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
out float4 texcol, out float ialpha)
{
// Compute the coordinates of the four texels we will be interpolating between.
// Clamp this to the triangle texture coordinates.
float2 texel_top_left = frac(coords) - float2(0.5, 0.5);
float2 texel_offset = sign(texel_top_left);
float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y),
float4(0.0, 0.0, 0.0, 0.0));
// Load four texels.
float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw));
float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw));
float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw));
float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw));
// Compute alpha from how many texels aren't pixel color 0000h.
float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR));
float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR));
float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR));
float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR));
// Bilinearly interpolate.
float2 weights = abs(texel_top_left);
texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y);
ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y);
// Compensate for partially transparent sampling.
if (ialpha > 0.0)
texcol.rgb /= float3(ialpha, ialpha, ialpha);
#if BINALPHA
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}
)";
}
else if (texture_filter == GPUTextureFilter::JINC2 || texture_filter == GPUTextureFilter::JINC2BinAlpha)
{
/*
Hyllian's jinc windowed-jinc 2-lobe sharper with anti-ringing Shader
Copyright (C) 2011-2016 Hyllian/Jararaca - sergiogdb@gmail.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::JINC2BinAlpha);
ss << R"(
CONSTANT float JINC2_WINDOW_SINC = 0.44;
CONSTANT float JINC2_SINC = 0.82;
CONSTANT float JINC2_AR_STRENGTH = 0.8;
CONSTANT float halfpi = 1.5707963267948966192313216916398;
CONSTANT float pi = 3.1415926535897932384626433832795;
CONSTANT float wa = 1.382300768;
CONSTANT float wb = 2.576105976;
// Calculates the distance between two points
float d(float2 pt1, float2 pt2)
{
float2 v = pt2 - pt1;
return sqrt(dot(v,v));
}
float min4(float a, float b, float c, float d)
{
return min(a, min(b, min(c, d)));
}
float4 min4(float4 a, float4 b, float4 c, float4 d)
{
return min(a, min(b, min(c, d)));
}
float max4(float a, float b, float c, float d)
{
return max(a, max(b, max(c, d)));
}
float4 max4(float4 a, float4 b, float4 c, float4 d)
{
return max(a, max(b, max(c, d)));
}
float4 resampler(float4 x)
{
float4 res;
// res = (x==float4(0.0, 0.0, 0.0, 0.0)) ? float4(wa*wb) : sin(x*wa)*sin(x*wb)/(x*x);
// Need to use mix(.., equal(..)) since we want zero check to be component wise
res = lerp(sin(x*wa)*sin(x*wb)/(x*x), float4(wa*wb, wa*wb, wa*wb, wa*wb), VECTOR_COMP_EQ(x,float4(0.0, 0.0, 0.0, 0.0)));
return res;
}
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
out float4 texcol, out float ialpha)
{
float4 weights[4];
float2 dx = float2(1.0, 0.0);
float2 dy = float2(0.0, 1.0);
float2 pc = coords.xy;
float2 tc = (floor(pc-float2(0.5,0.5))+float2(0.5,0.5));
weights[0] = resampler(float4(d(pc, tc -dx -dy), d(pc, tc -dy), d(pc, tc +dx -dy), d(pc, tc+2.0*dx -dy)));
weights[1] = resampler(float4(d(pc, tc -dx ), d(pc, tc ), d(pc, tc +dx ), d(pc, tc+2.0*dx )));
weights[2] = resampler(float4(d(pc, tc -dx +dy), d(pc, tc +dy), d(pc, tc +dx +dy), d(pc, tc+2.0*dx +dy)));
weights[3] = resampler(float4(d(pc, tc -dx+2.0*dy), d(pc, tc +2.0*dy), d(pc, tc +dx+2.0*dy), d(pc, tc+2.0*dx+2.0*dy)));
dx = dx;
dy = dy;
tc = tc;
#define sample_texel(coords) SampleFromVRAM(texpage, clamp((coords), uv_limits.xy, uv_limits.zw))
float4 c00 = sample_texel(tc -dx -dy);
float a00 = float(VECTOR_NEQ(c00, TRANSPARENT_PIXEL_COLOR));
float4 c10 = sample_texel(tc -dy);
float a10 = float(VECTOR_NEQ(c10, TRANSPARENT_PIXEL_COLOR));
float4 c20 = sample_texel(tc +dx -dy);
float a20 = float(VECTOR_NEQ(c20, TRANSPARENT_PIXEL_COLOR));
float4 c30 = sample_texel(tc+2.0*dx -dy);
float a30 = float(VECTOR_NEQ(c30, TRANSPARENT_PIXEL_COLOR));
float4 c01 = sample_texel(tc -dx );
float a01 = float(VECTOR_NEQ(c01, TRANSPARENT_PIXEL_COLOR));
float4 c11 = sample_texel(tc );
float a11 = float(VECTOR_NEQ(c11, TRANSPARENT_PIXEL_COLOR));
float4 c21 = sample_texel(tc +dx );
float a21 = float(VECTOR_NEQ(c21, TRANSPARENT_PIXEL_COLOR));
float4 c31 = sample_texel(tc+2.0*dx );
float a31 = float(VECTOR_NEQ(c31, TRANSPARENT_PIXEL_COLOR));
float4 c02 = sample_texel(tc -dx +dy);
float a02 = float(VECTOR_NEQ(c02, TRANSPARENT_PIXEL_COLOR));
float4 c12 = sample_texel(tc +dy);
float a12 = float(VECTOR_NEQ(c12, TRANSPARENT_PIXEL_COLOR));
float4 c22 = sample_texel(tc +dx +dy);
float a22 = float(VECTOR_NEQ(c22, TRANSPARENT_PIXEL_COLOR));
float4 c32 = sample_texel(tc+2.0*dx +dy);
float a32 = float(VECTOR_NEQ(c32, TRANSPARENT_PIXEL_COLOR));
float4 c03 = sample_texel(tc -dx+2.0*dy);
float a03 = float(VECTOR_NEQ(c03, TRANSPARENT_PIXEL_COLOR));
float4 c13 = sample_texel(tc +2.0*dy);
float a13 = float(VECTOR_NEQ(c13, TRANSPARENT_PIXEL_COLOR));
float4 c23 = sample_texel(tc +dx+2.0*dy);
float a23 = float(VECTOR_NEQ(c23, TRANSPARENT_PIXEL_COLOR));
float4 c33 = sample_texel(tc+2.0*dx+2.0*dy);
float a33 = float(VECTOR_NEQ(c33, TRANSPARENT_PIXEL_COLOR));
#undef sample_texel
// Get min/max samples
float4 min_sample = min4(c11, c21, c12, c22);
float min_sample_alpha = min4(a11, a21, a12, a22);
float4 max_sample = max4(c11, c21, c12, c22);
float max_sample_alpha = max4(a11, a21, a12, a22);
float4 color;
color = float4(dot(weights[0], float4(c00.x, c10.x, c20.x, c30.x)), dot(weights[0], float4(c00.y, c10.y, c20.y, c30.y)), dot(weights[0], float4(c00.z, c10.z, c20.z, c30.z)), dot(weights[0], float4(c00.w, c10.w, c20.w, c30.w)));
color+= float4(dot(weights[1], float4(c01.x, c11.x, c21.x, c31.x)), dot(weights[1], float4(c01.y, c11.y, c21.y, c31.y)), dot(weights[1], float4(c01.z, c11.z, c21.z, c31.z)), dot(weights[1], float4(c01.w, c11.w, c21.w, c31.w)));
color+= float4(dot(weights[2], float4(c02.x, c12.x, c22.x, c32.x)), dot(weights[2], float4(c02.y, c12.y, c22.y, c32.y)), dot(weights[2], float4(c02.z, c12.z, c22.z, c32.z)), dot(weights[2], float4(c02.w, c12.w, c22.w, c32.w)));
color+= float4(dot(weights[3], float4(c03.x, c13.x, c23.x, c33.x)), dot(weights[3], float4(c03.y, c13.y, c23.y, c33.y)), dot(weights[3], float4(c03.z, c13.z, c23.z, c33.z)), dot(weights[3], float4(c03.w, c13.w, c23.w, c33.w)));
color = color/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
float alpha;
alpha = dot(weights[0], float4(a00, a10, a20, a30));
alpha+= dot(weights[1], float4(a01, a11, a21, a31));
alpha+= dot(weights[2], float4(a02, a12, a22, a32));
alpha+= dot(weights[3], float4(a03, a13, a23, a33));
//alpha = alpha/(weights[0].w + weights[1].w + weights[2].w + weights[3].w);
alpha = alpha/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1)));
// Anti-ringing
float4 aux = color;
float aux_alpha = alpha;
color = clamp(color, min_sample, max_sample);
alpha = clamp(alpha, min_sample_alpha, max_sample_alpha);
color = lerp(aux, color, JINC2_AR_STRENGTH);
alpha = lerp(aux_alpha, alpha, JINC2_AR_STRENGTH);
// final sum and weight normalization
ialpha = alpha;
texcol = color;
// Compensate for partially transparent sampling.
if (ialpha > 0.0)
texcol.rgb /= float3(ialpha, ialpha, ialpha);
#if BINALPHA
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}
)";
}
else if (texture_filter == GPUTextureFilter::xBR || texture_filter == GPUTextureFilter::xBRBinAlpha)
{
/*
Hyllian's xBR-vertex code and texel mapping
Copyright (C) 2011/2016 Hyllian - sergiogdb@gmail.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::xBRBinAlpha);
ss << R"(
CONSTANT int BLEND_NONE = 0;
CONSTANT int BLEND_NORMAL = 1;
CONSTANT int BLEND_DOMINANT = 2;
CONSTANT float LUMINANCE_WEIGHT = 1.0;
CONSTANT float EQUAL_COLOR_TOLERANCE = 0.1176470588235294;
CONSTANT float STEEP_DIRECTION_THRESHOLD = 2.2;
CONSTANT float DOMINANT_DIRECTION_THRESHOLD = 3.6;
CONSTANT float4 w = float4(0.2627, 0.6780, 0.0593, 0.5);
float DistYCbCr(float4 pixA, float4 pixB)
{
const float scaleB = 0.5 / (1.0 - w.b);
const float scaleR = 0.5 / (1.0 - w.r);
float4 diff = pixA - pixB;
float Y = dot(diff, w);
float Cb = scaleB * (diff.b - Y);
float Cr = scaleR * (diff.r - Y);
return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr));
}
bool IsPixEqual(const float4 pixA, const float4 pixB)
{
return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE);
}
float get_left_ratio(float2 center, float2 origin, float2 direction, float2 scale)
{
float2 P0 = center - origin;
float2 proj = direction * (dot(P0, direction) / dot(direction, direction));
float2 distv = P0 - proj;
float2 orth = float2(-direction.y, direction.x);
float side = sign(dot(P0, orth));
float v = side * length(distv * scale);
// return step(0, v);
return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v);
}
#define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw))
void FilteredSampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords, float4 uv_limits,
out float4 texcol, out float ialpha)
{
//---------------------------------------
// Input Pixel Mapping: -|x|x|x|-
// x|A|B|C|x
// x|D|E|F|x
// x|G|H|I|x
// -|x|x|x|-
float2 scale = float2(8.0, 8.0);
float2 pos = frac(coords.xy) - float2(0.5, 0.5);
float2 coord = coords.xy - pos;
float4 A = P(coord, -1,-1);
float Aw = A.w;
A.w = float(VECTOR_NEQ(A, TRANSPARENT_PIXEL_COLOR));
float4 B = P(coord, 0,-1);
float Bw = B.w;
B.w = float(VECTOR_NEQ(B, TRANSPARENT_PIXEL_COLOR));
float4 C = P(coord, 1,-1);
float Cw = C.w;
C.w = float(VECTOR_NEQ(C, TRANSPARENT_PIXEL_COLOR));
float4 D = P(coord, -1, 0);
float Dw = D.w;
D.w = float(VECTOR_NEQ(D, TRANSPARENT_PIXEL_COLOR));
float4 E = P(coord, 0, 0);
float Ew = E.w;
E.w = float(VECTOR_NEQ(E, TRANSPARENT_PIXEL_COLOR));
float4 F = P(coord, 1, 0);
float Fw = F.w;
F.w = float(VECTOR_NEQ(F, TRANSPARENT_PIXEL_COLOR));
float4 G = P(coord, -1, 1);
float Gw = G.w;
G.w = float(VECTOR_NEQ(G, TRANSPARENT_PIXEL_COLOR));
float4 H = P(coord, 0, 1);
float Hw = H.w;
H.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
float4 I = P(coord, 1, 1);
float Iw = I.w;
I.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR));
// blendResult Mapping: x|y|
// w|z|
int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE);
// Preprocess corners
// Pixel Tap Mapping: -|-|-|-|-
// -|-|B|C|-
// -|D|E|F|x
// -|G|H|I|x
// -|-|x|x|-
if (!((VECTOR_EQ(E,F) && VECTOR_EQ(H,I)) || (VECTOR_EQ(E,H) && VECTOR_EQ(F,I))))
{
float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(coord, 0,2), I) + DistYCbCr(I, P(coord, 2,0)) + (4.0 * DistYCbCr(H, F));
float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(coord, 1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(coord, 2,1)) + (4.0 * DistYCbCr(E, I));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I;
blendResult.z = ((dist_H_F < dist_E_I) && VECTOR_NEQ(E,F) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: -|-|-|-|-
// -|A|B|-|-
// x|D|E|F|-
// x|G|H|I|-
// -|x|x|-|-
if (!((VECTOR_EQ(D,E) && VECTOR_EQ(G,H)) || (VECTOR_EQ(D,G) && VECTOR_EQ(E,H))))
{
float dist_G_E = DistYCbCr(P(coord, -2,1) , D) + DistYCbCr(D, B) + DistYCbCr(P(coord, -1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E));
float dist_D_H = DistYCbCr(P(coord, -2,0) , G) + DistYCbCr(G, P(coord, 0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E;
blendResult.w = ((dist_G_E > dist_D_H) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: -|-|x|x|-
// -|A|B|C|x
// -|D|E|F|x
// -|-|H|I|-
// -|-|-|-|-
if (!((VECTOR_EQ(B,C) && VECTOR_EQ(E,F)) || (VECTOR_EQ(B,E) && VECTOR_EQ(C,F))))
{
float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(coord, 1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(coord, 2,-1)) + (4.0 * DistYCbCr(E, C));
float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(coord, 0,-2), C) + DistYCbCr(C, P(coord, 2,0)) + (4.0 * DistYCbCr(B, F));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C;
blendResult.y = ((dist_E_C > dist_B_F) && VECTOR_NEQ(E,B) && VECTOR_NEQ(E,F)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
// Pixel Tap Mapping: -|x|x|-|-
// x|A|B|C|-
// x|D|E|F|-
// -|G|H|-|-
// -|-|-|-|-
if (!((VECTOR_EQ(A,B) && VECTOR_EQ(D,E)) || (VECTOR_EQ(A,D) && VECTOR_EQ(B,E))))
{
float dist_D_B = DistYCbCr(P(coord, -2,0), A) + DistYCbCr(A, P(coord, 0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B));
float dist_A_E = DistYCbCr(P(coord, -2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(coord, -1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E));
bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E;
blendResult.x = ((dist_D_B < dist_A_E) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE;
}
float4 res = E;
float resW = Ew;
// Pixel Tap Mapping: -|-|-|-|-
// -|-|B|C|-
// -|D|E|F|x
// -|G|H|I|x
// -|-|x|x|-
if(blendResult.z != BLEND_NONE)
{
float dist_F_G = DistYCbCr(F, G);
float dist_H_C = DistYCbCr(H, C);
bool doLineBlend = (blendResult.z == BLEND_DOMINANT ||
!((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) ||
(IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I))));
float2 origin = float2(0.0, 1.0 / sqrt(2.0));
float2 direction = float2(1.0, -1.0);
if(doLineBlend)
{
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(D,G);
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(B,C);
origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5);
direction.x += haveShallowLine? 1.0: 0.0;
direction.y -= haveSteepLine? 1.0: 0.0;
}
float4 blendPix = lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H)));
float blendW = lerp(Hw,Fw, step(DistYCbCr(E, F), DistYCbCr(E, H)));
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
}
// Pixel Tap Mapping: -|-|-|-|-
// -|A|B|-|-
// x|D|E|F|-
// x|G|H|I|-
// -|x|x|-|-
if(blendResult.w != BLEND_NONE)
{
float dist_H_A = DistYCbCr(H, A);
float dist_D_I = DistYCbCr(D, I);
bool doLineBlend = (blendResult.w == BLEND_DOMINANT ||
!((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) ||
(IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G))));
float2 origin = float2(-1.0 / sqrt(2.0), 0.0);
float2 direction = float2(1.0, 1.0);
if(doLineBlend)
{
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(B,A);
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(F,I);
origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0);
direction.y += haveShallowLine? 1.0: 0.0;
direction.x += haveSteepLine? 1.0: 0.0;
}
origin = origin;
direction = direction;
float4 blendPix = lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H)));
float blendW = lerp(Hw,Dw, step(DistYCbCr(E, D), DistYCbCr(E, H)));
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
}
// Pixel Tap Mapping: -|-|x|x|-
// -|A|B|C|x
// -|D|E|F|x
// -|-|H|I|-
// -|-|-|-|-
if(blendResult.y != BLEND_NONE)
{
float dist_B_I = DistYCbCr(B, I);
float dist_F_A = DistYCbCr(F, A);
bool doLineBlend = (blendResult.y == BLEND_DOMINANT ||
!((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) ||
(IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C))));
float2 origin = float2(1.0 / sqrt(2.0), 0.0);
float2 direction = float2(-1.0, -1.0);
if(doLineBlend)
{
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(H,I);
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(D,A);
origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0);
direction.y -= haveShallowLine? 1.0: 0.0;
direction.x -= haveSteepLine? 1.0: 0.0;
}
float4 blendPix = lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F)));
float blendW = lerp(Fw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, F)));
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
}
// Pixel Tap Mapping: -|x|x|-|-
// x|A|B|C|-
// x|D|E|F|-
// -|G|H|-|-
// -|-|-|-|-
if(blendResult.x != BLEND_NONE)
{
float dist_D_C = DistYCbCr(D, C);
float dist_B_G = DistYCbCr(B, G);
bool doLineBlend = (blendResult.x == BLEND_DOMINANT ||
!((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) ||
(IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A))));
float2 origin = float2(0.0, -1.0 / sqrt(2.0));
float2 direction = float2(-1.0, 1.0);
if(doLineBlend)
{
bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(F,C);
bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(H,G);
origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5);
direction.x -= haveShallowLine? 1.0: 0.0;
direction.y += haveSteepLine? 1.0: 0.0;
}
float4 blendPix = lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D)));
float blendW = lerp(Dw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, D)));
res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale));
resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale));
}
ialpha = res.w;
texcol = float4(res.xyz, resW);
// Compensate for partially transparent sampling.
if (ialpha > 0.0)
texcol.rgb /= float3(ialpha, ialpha, ialpha);
#if BINALPHA
ialpha = (ialpha >= 0.5) ? 1.0 : 0.0;
#endif
}
#undef P
)";
}
}
std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(
GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPU_HW::BatchTextureMode texture_mode,
GPUTextureFilter texture_filtering, bool uv_limits, bool force_round_texcoords, bool dithering, bool interlacing,
bool check_mask, bool use_rov, bool use_rov_depth, bool rov_depth_test)
{
// TODO: don't write depth for shader blend
DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
DebugAssert(!rov_depth_test || (use_rov && use_rov_depth));
const bool textured = (texture_mode != GPU_HW::BatchTextureMode::Disabled);
const bool palette =
(texture_mode == GPU_HW::BatchTextureMode::Palette4Bit || texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend);
const bool use_dual_source = (!shader_blending && !use_rov && m_supports_dual_source_blend &&
((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled &&
render_mode != GPU_HW::BatchRenderMode::OnlyOpaque) ||
texture_filtering != GPUTextureFilter::Nearest));
std::stringstream ss;
WriteHeader(ss, use_rov);
DefineMacro(ss, "TRANSPARENCY", render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled);
DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", render_mode == GPU_HW::BatchRenderMode::OnlyOpaque);
DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENT", render_mode == GPU_HW::BatchRenderMode::OnlyTransparent);
DefineMacro(ss, "TRANSPARENCY_MODE", static_cast<s32>(transparency));
DefineMacro(ss, "SHADER_BLENDING", shader_blending);
DefineMacro(ss, "CHECK_MASK_BIT", check_mask);
DefineMacro(ss, "TEXTURED", textured);
DefineMacro(ss, "PALETTE", palette);
DefineMacro(ss, "PALETTE_4_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette4Bit);
DefineMacro(ss, "PALETTE_8_BIT", texture_mode == GPU_HW::BatchTextureMode::Palette8Bit);
DefineMacro(ss, "DITHERING", dithering);
DefineMacro(ss, "DITHERING_SCALED", m_scaled_dithering);
// Debanding requires true color to work correctly.
DefineMacro(ss, "DEBANDING", m_true_color && m_debanding);
DefineMacro(ss, "INTERLACING", interlacing);
DefineMacro(ss, "TRUE_COLOR", m_true_color);
DefineMacro(ss, "TEXTURE_FILTERING", texture_filtering != GPUTextureFilter::Nearest);
DefineMacro(ss, "UV_LIMITS", uv_limits);
DefineMacro(ss, "USE_ROV", use_rov);
DefineMacro(ss, "USE_ROV_DEPTH", use_rov_depth);
DefineMacro(ss, "ROV_DEPTH_TEST", rov_depth_test);
DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source);
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
DefineMacro(ss, "FORCE_ROUND_TEXCOORDS", force_round_texcoords);
DefineMacro(ss, "UPSCALED", m_resolution_scale > 1);
WriteCommonFunctions(ss);
WriteBatchUniformBuffer(ss);
DeclareTexture(ss, "samp0", 0);
if (use_rov)
{
DeclareImage(ss, "rov_color", 0);
if (use_rov_depth)
DeclareImage(ss, "rov_depth", 1, true);
}
if (m_glsl)
ss << "CONSTANT int[16] s_dither_values = int[16]( ";
else
ss << "CONSTANT int s_dither_values[] = {";
for (u32 i = 0; i < 16; i++)
{
if (i > 0)
ss << ", ";
ss << DITHER_MATRIX[i / 4][i % 4];
}
if (m_glsl)
ss << " );\n";
else
ss << "};\n";
ss << R"(
uint3 ApplyDithering(uint2 coord, uint3 icol)
{
#if DITHERING_SCALED
uint2 fc = coord & uint2(3u, 3u);
#else
uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u);
#endif
int offset = s_dither_values[fc.y * 4u + fc.x];
#if !TRUE_COLOR
return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31));
#else
return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255));
#endif
}
#if TEXTURED
CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0);
#if PALETTE
#define TEXPAGE_VALUE uint4
#else
#define TEXPAGE_VALUE uint2
#endif
uint2 ApplyTextureWindow(uint2 coords)
{
uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x;
uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y;
return uint2(x, y);
}
uint2 FloatToIntegerCoords(float2 coords)
{
// With the vertex offset applied at 1x resolution scale, we want to round the texture coordinates.
// Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied.
return uint2((RESOLUTION_SCALE == 1u || FORCE_ROUND_TEXCOORDS != 0) ? roundEven(coords) : floor(coords));
}
float4 SampleFromVRAM(TEXPAGE_VALUE texpage, float2 coords)
{
#if PALETTE
uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
uint2 vicoord;
#if PALETTE_4_BIT
// 4bit will never wrap, since it's in the last texpage row.
vicoord = uint2(texpage.x + (icoord.x / 4u), texpage.y + icoord.y);
#elif PALETTE_8_BIT
// 8bit can wrap in the X direction.
vicoord = uint2((texpage.x + (icoord.x / 2u)) & 0x3FFu, texpage.y + icoord.y);
#endif
// load colour/palette
float4 texel = LOAD_TEXTURE(samp0, int2(vicoord * RESOLUTION_SCALE), 0);
uint vram_value = RGBA8ToRGBA5551(texel);
// apply palette
#if PALETTE_4_BIT
uint subpixel = icoord.x & 3u;
uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu;
uint2 palette_icoord = uint2((texpage.z + palette_index), texpage.w);
#elif PALETTE_8_BIT
// can only wrap in X direction for 8-bit, 4-bit will fit in texpage size.
uint subpixel = icoord.x & 1u;
uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu;
uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu), texpage.w);
#endif
return LOAD_TEXTURE(samp0, int2(palette_icoord * RESOLUTION_SCALE), 0);
#else
// Direct texturing - usually render-to-texture effects.
uint2 vicoord;
#if !UPSCALED
uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords));
vicoord = (texpage.xy + icoord) & uint2(1023, 511);
#else
// Coordinates are already upscaled, we need to downscale them to apply the texture
// window, then re-upscale/offset. We can't round here, because it could result in
// going outside of the texture window.
float2 ncoords = coords / float(RESOLUTION_SCALE);
float2 nfpart = frac(ncoords);
uint2 nicoord = ApplyTextureWindow(uint2(floor(ncoords)));
uint2 nvicoord = (texpage.xy + nicoord) & uint2(1023, 511);
coords = (float2(nvicoord) + nfpart) * float(RESOLUTION_SCALE);
vicoord = uint2(floor(coords));
#endif
return LOAD_TEXTURE(samp0, int2(vicoord), 0);
#endif
}
#endif
// From https://alex.vlachos.com/graphics/Alex_Vlachos_Advanced_VR_Rendering_GDC2015.pdf
// and https://www.shadertoy.com/view/MslGR8 (5th one starting from the bottom)
// NOTE: `frag_coord` is in pixels (i.e. not normalized UV).
float3 ApplyDebanding(float2 frag_coord)
{
#if DEBANDING
// Iestyn's RGB dither (7 asm instructions) from Portal 2 X360, slightly modified for VR.
float ditherc = dot(vec2(171.0, 231.0), frag_coord);
float3 dither = float3(ditherc, ditherc, ditherc);
dither = fract(dither / float3(103.0, 71.0, 97.0));
// Subtract 0.5 to avoid slightly brightening the whole viewport.
return (dither - 0.5) / 255.0;
#else
return float3(0.0, 0.0, 0.0);
#endif
}
)";
const u32 num_fragment_outputs = use_rov ? 0 : (use_dual_source ? 2 : 1);
if (textured)
{
if (texture_filtering != GPUTextureFilter::Nearest)
WriteBatchTextureFilter(ss, texture_filtering);
if (uv_limits)
{
DeclareFragmentEntryPoint(ss, 1, 1,
{{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"},
{"nointerpolation", "float4 v_uv_limits"}},
true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
UsingPerSampleShading(), false, m_disable_color_perspective,
shader_blending && !use_rov, use_rov);
}
else
{
DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", palette ? "uint4 v_texpage" : "uint2 v_texpage"}}, true,
num_fragment_outputs, use_dual_source, m_write_mask_as_depth, UsingMSAA(),
UsingPerSampleShading(), false, m_disable_color_perspective,
shader_blending && !use_rov, use_rov);
}
}
else
{
DeclareFragmentEntryPoint(ss, 1, 0, {}, true, num_fragment_outputs, use_dual_source, m_write_mask_as_depth,
UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective,
shader_blending && !use_rov, use_rov);
}
ss << R"(
{
uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
uint2 fragpos = uint2(v_pos.xy);
bool semitransparent;
uint3 icolor;
float ialpha;
float oalpha;
#if INTERLACING
if ((fragpos.y & 1u) == u_interlaced_displayed_field)
discard;
#endif
#if TEXTURED
float4 texcol;
#if TEXTURE_FILTERING
FilteredSampleFromVRAM(v_texpage, v_tex0, v_uv_limits, texcol, ialpha);
if (ialpha < 0.5)
discard;
#else
#if UV_LIMITS
texcol = SampleFromVRAM(v_texpage, clamp(v_tex0, v_uv_limits.xy, v_uv_limits.zw));
#else
texcol = SampleFromVRAM(v_texpage, v_tex0);
#endif
if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR))
discard;
ialpha = 1.0;
#endif
semitransparent = (texcol.a >= 0.5);
// If not using true color, truncate the framebuffer colors to 5-bit.
#if !TRUE_COLOR
icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3;
icolor = (icolor * vertcol) >> 4;
#if DITHERING
icolor = ApplyDithering(fragpos, icolor);
#else
icolor = min(icolor >> 3, uint3(31u, 31u, 31u));
#endif
#else
icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy));
icolor = (icolor * vertcol) >> 7;
#if DITHERING
icolor = ApplyDithering(fragpos, icolor);
#else
icolor = min(icolor, uint3(255u, 255u, 255u));
#endif
#endif
// Compute output alpha (mask bit)
oalpha = float(u_set_mask_while_drawing ? 1 : int(semitransparent));
#else
// All pixels are semitransparent for untextured polygons.
semitransparent = true;
icolor = vertcol;
ialpha = 1.0;
#if DITHERING
icolor = ApplyDithering(fragpos, icolor);
#else
#if !TRUE_COLOR
icolor >>= 3;
#endif
#endif
// However, the mask bit is cleared if set mask bit is false.
oalpha = float(u_set_mask_while_drawing);
#endif
#if SHADER_BLENDING
#if USE_ROV
BEGIN_ROV_REGION;
float4 bg_col = ROV_LOAD(rov_color, fragpos);
float4 o_col0;
bool discarded = false;
#if ROV_DEPTH_TEST
float bg_depth = ROV_LOAD(rov_depth, fragpos).r;
discarded = (v_pos.z > bg_depth);
#endif
#if CHECK_MASK_BIT
discarded = discarded || (bg_col.a != 0.0);
#endif
#else
float4 bg_col = LAST_FRAG_COLOR;
#if CHECK_MASK_BIT
if (bg_col.a != 0.0)
discard;
#endif
#endif
// Work in normalized space for true colour, matches HW blend.
float4 fg_col = float4(float3(icolor), oalpha);
#if TRUE_COLOR
fg_col.rgb /= 255.0;
#elif TRANSPARENCY // rgb not used in check-mask only
bg_col.rgb = roundEven(bg_col.rgb * 31.0);
#endif
#if TEXTURE_FILTERING
#if TRANSPARENCY_MODE == 0 || TRANSPARENCY_MODE == 3
bg_col.rgb /= ialpha;
#endif
fg_col.rgb *= ialpha;
#endif
o_col0.a = fg_col.a;
#if TRANSPARENCY_MODE == 0 // Half BG + Half FG.
o_col0.rgb = (bg_col.rgb * 0.5) + (fg_col.rgb * 0.5);
#elif TRANSPARENCY_MODE == 1 // BG + FG
o_col0.rgb = bg_col.rgb + fg_col.rgb;
#elif TRANSPARENCY_MODE == 2 // BG - FG
o_col0.rgb = bg_col.rgb - fg_col.rgb;
#elif TRANSPARENCY_MODE == 3 // BG + 1/4 FG.
o_col0.rgb = bg_col.rgb + (fg_col.rgb * 0.25);
#else
o_col0.rgb = fg_col.rgb;
#endif
// 16-bit truncation.
#if !TRUE_COLOR && TRANSPARENCY
o_col0.rgb = floor(o_col0.rgb);
#endif
#if TRANSPARENCY
// If pixel isn't marked as semitransparent, replace with previous colour.
o_col0 = semitransparent ? o_col0 : fg_col;
#endif
// Normalize for non-true-color.
#if !TRUE_COLOR
o_col0.rgb /= 31.0;
#endif
#if USE_ROV
if (!discarded)
{
ROV_STORE(rov_color, fragpos, o_col0);
#if USE_ROV_DEPTH
ROV_STORE(rov_depth, fragpos, float4(v_pos.z, 0.0, 0.0, 0.0));
#endif
}
END_ROV_REGION;
#endif
#else
// Premultiply alpha so we don't need to use a colour output for it.
float premultiply_alpha = ialpha;
#if TRANSPARENCY
premultiply_alpha = ialpha * (semitransparent ? u_src_alpha_factor : 1.0);
#endif
float3 color;
#if !TRUE_COLOR
// We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color
// into the blend unit, which can cause a small amount of error to accumulate.
color = floor(float3(icolor) * premultiply_alpha) / 31.0;
#else
// True color is actually simpler here since we want to preserve the precision.
color = (float3(icolor) * premultiply_alpha) / 255.0;
#endif
#if TRANSPARENCY && TEXTURED
// Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
if (semitransparent)
{
#if USE_DUAL_SOURCE
o_col0 = float4(color, oalpha);
o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
#else
o_col0 = float4(color, oalpha);
#endif
#if WRITE_MASK_AS_DEPTH
o_depth = oalpha * v_pos.z;
#endif
#if TRANSPARENCY_ONLY_OPAQUE
discard;
#endif
}
else
{
#if USE_DUAL_SOURCE
o_col0 = float4(color, oalpha);
o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
#else
o_col0 = float4(color, oalpha);
#endif
#if WRITE_MASK_AS_DEPTH
o_depth = oalpha * v_pos.z;
#endif
#if TRANSPARENCY_ONLY_TRANSPARENT
discard;
#endif
}
#elif TRANSPARENCY
// We shouldn't be rendering opaque geometry only when untextured, so no need to test/discard here.
#if USE_DUAL_SOURCE
o_col0 = float4(color, oalpha);
o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha);
#else
o_col0 = float4(color, oalpha);
#endif
#if WRITE_MASK_AS_DEPTH
o_depth = oalpha * v_pos.z;
#endif
#else
// Non-transparency won't enable blending so we can write the mask here regardless.
o_col0 = float4(color, oalpha);
#if USE_DUAL_SOURCE
o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha);
#endif
#if WRITE_MASK_AS_DEPTH
o_depth = oalpha * v_pos.z;
#endif
#endif
#endif
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(bool color_24bit, bool depth_buffer)
{
std::stringstream ss;
WriteHeader(ss);
DefineMacro(ss, "COLOR_24BIT", color_24bit);
DefineMacro(ss, "DEPTH_BUFFER", depth_buffer);
DefineMacro(ss, "MULTISAMPLED", UsingMSAA());
WriteCommonFunctions(ss);
DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_skip_x", "uint u_line_skip"}, true);
DeclareTexture(ss, "samp0", 0, UsingMSAA());
if (depth_buffer)
DeclareTexture(ss, "samp1", 1, UsingMSAA());
ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
value /= float(MULTISAMPLES);
return value;
#else
return LOAD_TEXTURE(samp0, coords, 0);
#endif
}
#if DEPTH_BUFFER
float LoadDepth(int2 coords)
{
// Need to duplicate because different types in different languages...
#if MULTISAMPLING
float value = LOAD_TEXTURE_MS(samp1, coords, 0u).r;
FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
value += LOAD_TEXTURE_MS(samp1, coords, sample_index).r;
value /= float(MULTISAMPLES);
return value;
#else
return LOAD_TEXTURE(samp1, coords, 0).r;
#endif
}
#endif
float3 SampleVRAM24(uint2 icoords)
{
// load adjacent 16-bit texels
uint2 clamp_size = uint2(1024, 512);
// relative to start of scanout
uint2 vram_coords = u_vram_offset + uint2((icoords.x * 3u) / 2u, icoords.y);
uint s0 = RGBA8ToRGBA5551(LoadVRAM(int2((vram_coords % clamp_size) * RESOLUTION_SCALE)));
uint s1 = RGBA8ToRGBA5551(LoadVRAM(int2(((vram_coords + uint2(1, 0)) % clamp_size) * RESOLUTION_SCALE)));
// select which part of the combined 16-bit texels we are currently shading
uint s1s0 = ((s1 << 16) | s0) >> ((icoords.x & 1u) * 8u);
// extract components and normalize
return float3(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
float((s1s0 >> 16u) & 0xFFu) / 255.0);
}
)";
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, depth_buffer ? 2 : 1);
ss << R"(
{
uint2 icoords = uint2(uint(v_pos.x) + u_skip_x, uint(v_pos.y) << u_line_skip);
int2 wrapped_coords = int2((icoords + u_vram_offset) % VRAM_SIZE);
#if COLOR_24BIT
o_col0 = float4(SampleVRAM24(icoords), 1.0);
#else
o_col0 = float4(LoadVRAM(wrapped_coords).rgb, 1.0);
#endif
#if DEPTH_BUFFER
o_col1 = float4(LoadDepth(wrapped_coords), 0.0, 0.0, 0.0);
#endif
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateWireframeGeometryShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
if (m_glsl)
{
ss << R"(
layout(triangles) in;
layout(line_strip, max_vertices = 6) out;
void main()
{
gl_Position = gl_in[0].gl_Position;
EmitVertex();
gl_Position = gl_in[1].gl_Position;
EmitVertex();
EndPrimitive();
gl_Position = gl_in[1].gl_Position;
EmitVertex();
gl_Position = gl_in[2].gl_Position;
EmitVertex();
EndPrimitive();
gl_Position = gl_in[2].gl_Position;
EmitVertex();
gl_Position = gl_in[0].gl_Position;
EmitVertex();
EndPrimitive();
}
)";
}
else
{
ss << R"(
struct GSInput
{
float4 col0 : COLOR0;
float4 pos : SV_Position;
};
struct GSOutput
{
float4 pos : SV_Position;
};
GSOutput GetVertex(GSInput vi)
{
GSOutput vo;
vo.pos = vi.pos;
return vo;
}
[maxvertexcount(6)]
void main(triangle GSInput input[3], inout LineStream<GSOutput> output)
{
output.Append(GetVertex(input[0]));
output.Append(GetVertex(input[1]));
output.RestartStrip();
output.Append(GetVertex(input[1]));
output.Append(GetVertex(input[2]));
output.RestartStrip();
output.Append(GetVertex(input[2]));
output.Append(GetVertex(input[0]));
output.RestartStrip();
}
)";
}
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateWireframeFragmentShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DeclareFragmentEntryPoint(ss, 0, 0);
ss << R"(
{
o_col0 = float4(1.0, 1.0, 1.0, 0.5);
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMReadFragmentShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size"}, true);
DeclareTexture(ss, "samp0", 0, UsingMSAA());
ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
value /= float(MULTISAMPLES);
return value;
#else
return LOAD_TEXTURE(samp0, coords, 0);
#endif
}
uint SampleVRAM(uint2 coords)
{
if (RESOLUTION_SCALE == 1u)
return RGBA8ToRGBA5551(LoadVRAM(int2(coords)));
// Box filter for downsampling.
float4 value = float4(0.0, 0.0, 0.0, 0.0);
uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++)
{
for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++)
value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y)));
}
value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE);
return RGBA8ToRGBA5551(value);
}
)";
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
ss << R"(
{
uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
sample_coords += u_base_coords;
// We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
uint left = SampleVRAM(sample_coords);
uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));
o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
float(right & 0xFFu), float((right >> 8) & 0xFFu))
/ float4(255.0, 255.0, 255.0, 255.0);
})";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo)
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
DefineMacro(ss, "USE_BUFFER", use_buffer);
DeclareUniformBuffer(ss,
{"uint2 u_base_coords", "uint2 u_end_coords", "uint2 u_size", "uint u_buffer_base_offset",
"uint u_mask_or_bits", "float u_depth_value"},
true);
if (!use_buffer)
{
DeclareTexture(ss, "samp0", 0, false, true, true);
}
else if (use_ssbo && m_glsl)
{
ss << "layout(std430";
if (IsVulkan())
ss << ", set = 0, binding = 0";
else if (IsMetal())
ss << ", set = 0, binding = 1";
else if (m_use_glsl_binding_layout)
ss << ", binding = 0";
ss << ") readonly restrict buffer SSBO {\n";
ss << " uint ssbo_data[];\n";
ss << "};\n\n";
ss << "#define GET_VALUE(buffer_offset) (ssbo_data[(buffer_offset) / 2u] >> (((buffer_offset) % 2u) * 16u))\n\n";
}
else
{
DeclareTextureBuffer(ss, "samp0", 0, true, true);
ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
}
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth);
ss << R"(
{
uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
// make sure it's not oversized and out of range
if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) ||
(coords.y < u_base_coords.y && coords.y >= u_end_coords.y))
{
discard;
}
// find offset from the start of the row/column
uint2 offset;
offset.x = (coords.x < u_base_coords.x) ? ((VRAM_SIZE.x / RESOLUTION_SCALE) - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x);
offset.y = (coords.y < u_base_coords.y) ? ((VRAM_SIZE.y / RESOLUTION_SCALE) - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y);
#if !USE_BUFFER
uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x;
#else
uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x;
uint value = GET_VALUE(buffer_offset) | u_mask_or_bits;
#endif
o_col0 = RGBA5551ToRGBA8(value);
#if WRITE_MASK_AS_DEPTH
o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0;
#endif
})";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
{
// TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer.
const bool msaa = false;
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
DeclareUniformBuffer(ss,
{"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_end_coords", "uint2 u_size",
"bool u_set_mask_bit", "float u_depth_value"},
true);
DeclareTexture(ss, "samp0", 0, msaa);
DefineMacro(ss, "MSAA_COPY", msaa);
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, m_write_mask_as_depth, false, false, msaa);
ss << R"(
{
uint2 dst_coords = uint2(v_pos.xy);
// make sure it's not oversized and out of range
if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
(dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
{
discard;
}
// find offset from the start of the row/column
uint2 offset;
offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
// find the source coordinates to copy from
uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE;
// sample and apply mask bit
#if MSAA_COPY
float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
#else
float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
#endif
o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
#if WRITE_MASK_AS_DEPTH
o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0));
#endif
})";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced)
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
DefineMacro(ss, "WRAPPED", wrapped);
DefineMacro(ss, "INTERLACED", interlaced);
DeclareUniformBuffer(
ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1, false, m_write_mask_as_depth, false, false, false);
ss << R"(
{
#if INTERLACED || WRAPPED
uint2 dst_coords = uint2(v_pos.xy);
#endif
#if INTERLACED
if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
discard;
#endif
#if WRAPPED
// make sure it's not oversized and out of range
if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
(dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
{
discard;
}
#endif
o_col0 = u_fill_color;
#if WRITE_MASK_AS_DEPTH
o_depth = u_fill_color.a;
#endif
})";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DeclareTexture(ss, "samp0", 0, UsingMSAA());
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, false, true, false, false, UsingMSAA());
ss << R"(
{
#if MULTISAMPLING
o_depth = LOAD_TEXTURE_MS(samp0, int2(v_pos.xy), f_sample_index).a;
#else
o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a;
#endif
}
)";
return ss.str();
}
void GPU_HW_ShaderGen::WriteAdaptiveDownsampleUniformBuffer(std::stringstream& ss)
{
DeclareUniformBuffer(ss, {"float2 u_uv_min", "float2 u_uv_max", "float2 u_rcp_resolution", "float u_lod"}, true);
}
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleVertexShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteAdaptiveDownsampleUniformBuffer(ss);
DeclareVertexEntryPoint(ss, {}, 0, 1, {}, true);
ss << R"(
{
v_tex0 = float2(float((v_id << 1) & 2u), float(v_id & 2u));
v_pos = float4(v_tex0 * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f), 0.0f, 1.0f);
v_tex0 = u_uv_min + (u_uv_max - u_uv_min) * v_tex0;
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
v_pos.y = -v_pos.y;
#endif
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleMipFragmentShader(bool first_pass)
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
WriteAdaptiveDownsampleUniformBuffer(ss);
DeclareTexture(ss, "samp0", 0, false);
DefineMacro(ss, "FIRST_PASS", first_pass);
// mipmap_energy.glsl ported from parallel-rsx.
ss << R"(
float4 get_bias(float3 c00, float3 c01, float3 c10, float3 c11)
{
// Measure the "energy" (variance) in the pixels.
// If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
float3 avg = 0.25 * (c00 + c01 + c10 + c11);
float s00 = dot(c00 - avg, c00 - avg);
float s01 = dot(c01 - avg, c01 - avg);
float s10 = dot(c10 - avg, c10 - avg);
float s11 = dot(c11 - avg, c11 - avg);
return float4(avg, 1.0 - log2(1000.0 * (s00 + s01 + s10 + s11) + 1.0));
}
float4 get_bias(float4 c00, float4 c01, float4 c10, float4 c11)
{
// Measure the "energy" (variance) in the pixels.
// If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
float avg = 0.25 * (c00.a + c01.a + c10.a + c11.a);
float4 bias = get_bias(c00.rgb, c01.rgb, c10.rgb, c11.rgb);
bias.a *= avg;
return bias;
}
)";
DeclareFragmentEntryPoint(ss, 0, 1);
ss << R"(
{
float2 uv = v_tex0 - (u_rcp_resolution * 0.25);
#ifdef FIRST_PASS
vec3 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0)).rgb;
vec3 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1)).rgb;
vec3 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0)).rgb;
vec3 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1)).rgb;
o_col0 = get_bias(c00, c01, c10, c11);
#else
vec4 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0));
vec4 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1));
vec4 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0));
vec4 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1));
o_col0 = get_bias(c00, c01, c10, c11);
#endif
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleBlurFragmentShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
WriteAdaptiveDownsampleUniformBuffer(ss);
DeclareTexture(ss, "samp0", 0, false);
// mipmap_blur.glsl ported from parallel-rsx.
DeclareFragmentEntryPoint(ss, 0, 1);
ss << R"(
{
float bias = 0.0;
const float w0 = 0.25;
const float w1 = 0.125;
const float w2 = 0.0625;
#define UV(x, y) clamp((v_tex0 + float2(x, y) * u_rcp_resolution), u_uv_min, u_uv_max)
bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, -1.0)).a;
bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, -1.0)).a;
bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, +1.0)).a;
bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, +1.0)).a;
bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, -1.0)).a;
bias += w1 * SAMPLE_TEXTURE(samp0, UV(-1.0, 0.0)).a;
bias += w1 * SAMPLE_TEXTURE(samp0, UV(+1.0, 0.0)).a;
bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, +1.0)).a;
bias += w0 * SAMPLE_TEXTURE(samp0, UV( 0.0, 0.0)).a;
o_col0 = float4(bias, bias, bias, bias);
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleCompositeFragmentShader()
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DeclareTexture(ss, "samp0", 0, false);
DeclareTexture(ss, "samp1", 1, false);
// mipmap_resolve.glsl ported from parallel-rsx.
DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
ss << R"(
{
float bias = SAMPLE_TEXTURE(samp1, v_tex0).r;
float mip = float(RESOLUTION_SCALE - 1u) * bias;
float3 color = SAMPLE_TEXTURE_LEVEL(samp0, v_tex0, mip).rgb;
o_col0 = float4(color, 1.0);
}
)";
return ss.str();
}
std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 factor)
{
std::stringstream ss;
WriteHeader(ss);
WriteCommonFunctions(ss);
DeclareUniformBuffer(ss, {"uint2 u_base_coords"}, true);
DeclareTexture(ss, "samp0", 0, false);
ss << "#define FACTOR " << factor << "\n";
DeclareFragmentEntryPoint(ss, 0, 1, {}, true);
ss << R"(
{
float3 color = float3(0.0, 0.0, 0.0);
uint2 base_coords = u_base_coords + uint2(v_pos.xy) * uint2(FACTOR, FACTOR);
for (uint offset_x = 0u; offset_x < FACTOR; offset_x++)
{
for (uint offset_y = 0u; offset_y < FACTOR; offset_y++)
color += LOAD_TEXTURE(samp0, int2(base_coords + uint2(offset_x, offset_y)), 0).rgb;
}
color /= float(FACTOR * FACTOR);
o_col0 = float4(color, 1.0);
}
)";
return ss.str();
}