// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "gpu_hw_shadergen.h" #include "common/assert.h" #include GPU_HW_ShaderGen::GPU_HW_ShaderGen(RenderAPI render_api, u32 resolution_scale, u32 multisamples, bool per_sample_shading, bool true_color, bool scaled_dithering, GPUTextureFilter texture_filtering, bool uv_limits, bool write_mask_as_depth, bool disable_color_perspective, bool supports_dual_source_blend, bool supports_framebuffer_fetch, bool debanding) : ShaderGen(render_api, supports_dual_source_blend, supports_framebuffer_fetch), m_resolution_scale(resolution_scale), m_multisamples(multisamples), m_per_sample_shading(per_sample_shading), m_true_color(true_color), m_scaled_dithering(scaled_dithering), m_texture_filter(texture_filtering), m_uv_limits(uv_limits), m_write_mask_as_depth(write_mask_as_depth), m_disable_color_perspective(disable_color_perspective), m_debanding(debanding) { } GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default; void GPU_HW_ShaderGen::WriteCommonFunctions(std::stringstream& ss) { DefineMacro(ss, "MULTISAMPLING", UsingMSAA()); ss << "CONSTANT uint RESOLUTION_SCALE = " << m_resolution_scale << "u;\n"; ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"; ss << "CONSTANT uint MULTISAMPLES = " << m_multisamples << "u;\n"; ss << "CONSTANT bool PER_SAMPLE_SHADING = " << (m_per_sample_shading ? "true" : "false") << ";\n"; ss << R"( uint RGBA8ToRGBA5551(float4 v) { uint r = uint(roundEven(v.r * 31.0)); uint g = uint(roundEven(v.g * 31.0)); uint b = uint(roundEven(v.b * 31.0)); uint a = (v.a != 0.0) ? 
1u : 0u; return (r) | (g << 5) | (b << 10) | (a << 15); } float4 RGBA5551ToRGBA8(uint v) { uint r = (v & 31u); uint g = ((v >> 5) & 31u); uint b = ((v >> 10) & 31u); uint a = ((v >> 15) & 1u); return float4(float(r) / 31.0, float(g) / 31.0, float(b) / 31.0, float(a)); } )"; } void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) { DeclareUniformBuffer(ss, {"uint2 u_texture_window_and", "uint2 u_texture_window_or", "float u_src_alpha_factor", "float u_dst_alpha_factor", "uint u_interlaced_displayed_field", "bool u_set_mask_while_drawing"}, false); } std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured, bool pgxp_depth) { std::stringstream ss; WriteHeader(ss); DefineMacro(ss, "TEXTURED", textured); DefineMacro(ss, "UV_LIMITS", m_uv_limits); DefineMacro(ss, "PGXP_DEPTH", pgxp_depth); WriteCommonFunctions(ss); WriteBatchUniformBuffer(ss); if (textured) { if (m_uv_limits) { DeclareVertexEntryPoint( ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage", "float4 a_uv_limits"}, 1, 1, {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective); } else { DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective); } } else { DeclareVertexEntryPoint(ss, {"float4 a_pos", "float4 a_col0"}, 1, 0, {}, false, "", UsingMSAA(), UsingPerSampleShading(), m_disable_color_perspective); } ss << R"( { // Offset the vertex position by 0.5 to ensure correct interpolation of texture coordinates // at 1x resolution scale. This doesn't work at >1x, we adjust the texture coordinates before // uploading there instead. float vertex_offset = (RESOLUTION_SCALE == 1u) ? 
0.5 : 0.0; // 0..+1023 -> -1..1 float pos_x = ((a_pos.x + vertex_offset) / 512.0) - 1.0; float pos_y = ((a_pos.y + vertex_offset) / -256.0) + 1.0; #if PGXP_DEPTH // Ignore mask Z when using PGXP depth. float pos_z = a_pos.w; float pos_w = a_pos.w; #else float pos_z = a_pos.z; float pos_w = a_pos.w; #endif #if API_OPENGL || API_OPENGL_ES // 0..1 to -1..1 depth range. pos_z = (pos_z * 2.0) - 1.0; #endif // NDC space Y flip in Vulkan. #if API_OPENGL || API_OPENGL_ES || API_VULKAN pos_y = -pos_y; #endif v_pos = float4(pos_x * pos_w, pos_y * pos_w, pos_z * pos_w, pos_w); v_col0 = a_col0; #if TEXTURED v_tex0 = float2(float((a_texcoord & 0xFFFFu) * RESOLUTION_SCALE), float((a_texcoord >> 16) * RESOLUTION_SCALE)); // base_x,base_y,palette_x,palette_y // Palette X is scaled in fragment shader, since it can wrap. v_texpage.x = (a_texpage & 15u) * 64u * RESOLUTION_SCALE; v_texpage.y = ((a_texpage >> 4) & 1u) * 256u * RESOLUTION_SCALE; v_texpage.z = ((a_texpage >> 16) & 63u) * 16u; v_texpage.w = ((a_texpage >> 22) & 511u) * RESOLUTION_SCALE; #if UV_LIMITS v_uv_limits = a_uv_limits * float4(255.0, 255.0, 255.0, 255.0); #endif #endif } )"; return ss.str(); } void GPU_HW_ShaderGen::WriteBatchTextureFilter(std::stringstream& ss, GPUTextureFilter texture_filter) { // JINC2 and xBRZ shaders originally from beetle-psx, modified to support filtering mask channel. if (texture_filter == GPUTextureFilter::Bilinear || texture_filter == GPUTextureFilter::BilinearBinAlpha) { DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::BilinearBinAlpha); ss << R"( void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { // Compute the coordinates of the four texels we will be interpolating between. // Clamp this to the triangle texture coordinates. 
float2 texel_top_left = frac(coords) - float2(0.5, 0.5); float2 texel_offset = sign(texel_top_left); float4 fcoords = max(coords.xyxy + float4(0.0, 0.0, texel_offset.x, texel_offset.y), float4(0.0, 0.0, 0.0, 0.0)); // Load four texels. float4 s00 = SampleFromVRAM(texpage, clamp(fcoords.xy, uv_limits.xy, uv_limits.zw)); float4 s10 = SampleFromVRAM(texpage, clamp(fcoords.zy, uv_limits.xy, uv_limits.zw)); float4 s01 = SampleFromVRAM(texpage, clamp(fcoords.xw, uv_limits.xy, uv_limits.zw)); float4 s11 = SampleFromVRAM(texpage, clamp(fcoords.zw, uv_limits.xy, uv_limits.zw)); // Compute alpha from how many texels aren't pixel color 0000h. float a00 = float(VECTOR_NEQ(s00, TRANSPARENT_PIXEL_COLOR)); float a10 = float(VECTOR_NEQ(s10, TRANSPARENT_PIXEL_COLOR)); float a01 = float(VECTOR_NEQ(s01, TRANSPARENT_PIXEL_COLOR)); float a11 = float(VECTOR_NEQ(s11, TRANSPARENT_PIXEL_COLOR)); // Bilinearly interpolate. float2 weights = abs(texel_top_left); texcol = lerp(lerp(s00, s10, weights.x), lerp(s01, s11, weights.x), weights.y); ialpha = lerp(lerp(a00, a10, weights.x), lerp(a01, a11, weights.x), weights.y); // Compensate for partially transparent sampling. if (ialpha > 0.0) texcol.rgb /= float3(ialpha, ialpha, ialpha); #if BINALPHA ialpha = (ialpha >= 0.5) ? 
1.0 : 0.0; #endif } )"; } else if (texture_filter == GPUTextureFilter::JINC2 || texture_filter == GPUTextureFilter::JINC2BinAlpha) { DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::JINC2BinAlpha); ss << R"( CONSTANT float JINC2_WINDOW_SINC = 0.44; CONSTANT float JINC2_SINC = 0.82; CONSTANT float JINC2_AR_STRENGTH = 0.8; CONSTANT float halfpi = 1.5707963267948966192313216916398; CONSTANT float pi = 3.1415926535897932384626433832795; CONSTANT float wa = 1.382300768; CONSTANT float wb = 2.576105976; // Calculates the distance between two points float d(float2 pt1, float2 pt2) { float2 v = pt2 - pt1; return sqrt(dot(v,v)); } float min4(float a, float b, float c, float d) { return min(a, min(b, min(c, d))); } float4 min4(float4 a, float4 b, float4 c, float4 d) { return min(a, min(b, min(c, d))); } float max4(float a, float b, float c, float d) { return max(a, max(b, max(c, d))); } float4 max4(float4 a, float4 b, float4 c, float4 d) { return max(a, max(b, max(c, d))); } float4 resampler(float4 x) { float4 res; // res = (x==float4(0.0, 0.0, 0.0, 0.0)) ? 
float4(wa*wb) : sin(x*wa)*sin(x*wb)/(x*x); // Need to use mix(.., equal(..)) since we want zero check to be component wise res = lerp(sin(x*wa)*sin(x*wb)/(x*x), float4(wa*wb, wa*wb, wa*wb, wa*wb), VECTOR_COMP_EQ(x,float4(0.0, 0.0, 0.0, 0.0))); return res; } void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { float4 weights[4]; float2 dx = float2(1.0, 0.0); float2 dy = float2(0.0, 1.0); float2 pc = coords.xy; float2 tc = (floor(pc-float2(0.5,0.5))+float2(0.5,0.5)); weights[0] = resampler(float4(d(pc, tc -dx -dy), d(pc, tc -dy), d(pc, tc +dx -dy), d(pc, tc+2.0*dx -dy))); weights[1] = resampler(float4(d(pc, tc -dx ), d(pc, tc ), d(pc, tc +dx ), d(pc, tc+2.0*dx ))); weights[2] = resampler(float4(d(pc, tc -dx +dy), d(pc, tc +dy), d(pc, tc +dx +dy), d(pc, tc+2.0*dx +dy))); weights[3] = resampler(float4(d(pc, tc -dx+2.0*dy), d(pc, tc +2.0*dy), d(pc, tc +dx+2.0*dy), d(pc, tc+2.0*dx+2.0*dy))); dx = dx; dy = dy; tc = tc; #define sample_texel(coords) SampleFromVRAM(texpage, clamp((coords), uv_limits.xy, uv_limits.zw)) float4 c00 = sample_texel(tc -dx -dy); float a00 = float(VECTOR_NEQ(c00, TRANSPARENT_PIXEL_COLOR)); float4 c10 = sample_texel(tc -dy); float a10 = float(VECTOR_NEQ(c10, TRANSPARENT_PIXEL_COLOR)); float4 c20 = sample_texel(tc +dx -dy); float a20 = float(VECTOR_NEQ(c20, TRANSPARENT_PIXEL_COLOR)); float4 c30 = sample_texel(tc+2.0*dx -dy); float a30 = float(VECTOR_NEQ(c30, TRANSPARENT_PIXEL_COLOR)); float4 c01 = sample_texel(tc -dx ); float a01 = float(VECTOR_NEQ(c01, TRANSPARENT_PIXEL_COLOR)); float4 c11 = sample_texel(tc ); float a11 = float(VECTOR_NEQ(c11, TRANSPARENT_PIXEL_COLOR)); float4 c21 = sample_texel(tc +dx ); float a21 = float(VECTOR_NEQ(c21, TRANSPARENT_PIXEL_COLOR)); float4 c31 = sample_texel(tc+2.0*dx ); float a31 = float(VECTOR_NEQ(c31, TRANSPARENT_PIXEL_COLOR)); float4 c02 = sample_texel(tc -dx +dy); float a02 = float(VECTOR_NEQ(c02, TRANSPARENT_PIXEL_COLOR)); float4 c12 = 
sample_texel(tc +dy); float a12 = float(VECTOR_NEQ(c12, TRANSPARENT_PIXEL_COLOR)); float4 c22 = sample_texel(tc +dx +dy); float a22 = float(VECTOR_NEQ(c22, TRANSPARENT_PIXEL_COLOR)); float4 c32 = sample_texel(tc+2.0*dx +dy); float a32 = float(VECTOR_NEQ(c32, TRANSPARENT_PIXEL_COLOR)); float4 c03 = sample_texel(tc -dx+2.0*dy); float a03 = float(VECTOR_NEQ(c03, TRANSPARENT_PIXEL_COLOR)); float4 c13 = sample_texel(tc +2.0*dy); float a13 = float(VECTOR_NEQ(c13, TRANSPARENT_PIXEL_COLOR)); float4 c23 = sample_texel(tc +dx+2.0*dy); float a23 = float(VECTOR_NEQ(c23, TRANSPARENT_PIXEL_COLOR)); float4 c33 = sample_texel(tc+2.0*dx+2.0*dy); float a33 = float(VECTOR_NEQ(c33, TRANSPARENT_PIXEL_COLOR)); #undef sample_texel // Get min/max samples float4 min_sample = min4(c11, c21, c12, c22); float min_sample_alpha = min4(a11, a21, a12, a22); float4 max_sample = max4(c11, c21, c12, c22); float max_sample_alpha = max4(a11, a21, a12, a22); float4 color; color = float4(dot(weights[0], float4(c00.x, c10.x, c20.x, c30.x)), dot(weights[0], float4(c00.y, c10.y, c20.y, c30.y)), dot(weights[0], float4(c00.z, c10.z, c20.z, c30.z)), dot(weights[0], float4(c00.w, c10.w, c20.w, c30.w))); color+= float4(dot(weights[1], float4(c01.x, c11.x, c21.x, c31.x)), dot(weights[1], float4(c01.y, c11.y, c21.y, c31.y)), dot(weights[1], float4(c01.z, c11.z, c21.z, c31.z)), dot(weights[1], float4(c01.w, c11.w, c21.w, c31.w))); color+= float4(dot(weights[2], float4(c02.x, c12.x, c22.x, c32.x)), dot(weights[2], float4(c02.y, c12.y, c22.y, c32.y)), dot(weights[2], float4(c02.z, c12.z, c22.z, c32.z)), dot(weights[2], float4(c02.w, c12.w, c22.w, c32.w))); color+= float4(dot(weights[3], float4(c03.x, c13.x, c23.x, c33.x)), dot(weights[3], float4(c03.y, c13.y, c23.y, c33.y)), dot(weights[3], float4(c03.z, c13.z, c23.z, c33.z)), dot(weights[3], float4(c03.w, c13.w, c23.w, c33.w))); color = color/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], 
float4(1,1,1,1))); float alpha; alpha = dot(weights[0], float4(a00, a10, a20, a30)); alpha+= dot(weights[1], float4(a01, a11, a21, a31)); alpha+= dot(weights[2], float4(a02, a12, a22, a32)); alpha+= dot(weights[3], float4(a03, a13, a23, a33)); //alpha = alpha/(weights[0].w + weights[1].w + weights[2].w + weights[3].w); alpha = alpha/(dot(weights[0], float4(1,1,1,1)) + dot(weights[1], float4(1,1,1,1)) + dot(weights[2], float4(1,1,1,1)) + dot(weights[3], float4(1,1,1,1))); // Anti-ringing float4 aux = color; float aux_alpha = alpha; color = clamp(color, min_sample, max_sample); alpha = clamp(alpha, min_sample_alpha, max_sample_alpha); color = lerp(aux, color, JINC2_AR_STRENGTH); alpha = lerp(aux_alpha, alpha, JINC2_AR_STRENGTH); // final sum and weight normalization ialpha = alpha; texcol = color; // Compensate for partially transparent sampling. if (ialpha > 0.0) texcol.rgb /= float3(ialpha, ialpha, ialpha); #if BINALPHA ialpha = (ialpha >= 0.5) ? 1.0 : 0.0; #endif } )"; } else if (texture_filter == GPUTextureFilter::xBR || texture_filter == GPUTextureFilter::xBRBinAlpha) { DefineMacro(ss, "BINALPHA", texture_filter == GPUTextureFilter::xBRBinAlpha); ss << R"( CONSTANT int BLEND_NONE = 0; CONSTANT int BLEND_NORMAL = 1; CONSTANT int BLEND_DOMINANT = 2; CONSTANT float LUMINANCE_WEIGHT = 1.0; CONSTANT float EQUAL_COLOR_TOLERANCE = 0.1176470588235294; CONSTANT float STEEP_DIRECTION_THRESHOLD = 2.2; CONSTANT float DOMINANT_DIRECTION_THRESHOLD = 3.6; CONSTANT float4 w = float4(0.2627, 0.6780, 0.0593, 0.5); float DistYCbCr(float4 pixA, float4 pixB) { const float scaleB = 0.5 / (1.0 - w.b); const float scaleR = 0.5 / (1.0 - w.r); float4 diff = pixA - pixB; float Y = dot(diff, w); float Cb = scaleB * (diff.b - Y); float Cr = scaleR * (diff.r - Y); return sqrt(((LUMINANCE_WEIGHT * Y) * (LUMINANCE_WEIGHT * Y)) + (Cb * Cb) + (Cr * Cr)); } bool IsPixEqual(const float4 pixA, const float4 pixB) { return (DistYCbCr(pixA, pixB) < EQUAL_COLOR_TOLERANCE); } float get_left_ratio(float2 
center, float2 origin, float2 direction, float2 scale) { float2 P0 = center - origin; float2 proj = direction * (dot(P0, direction) / dot(direction, direction)); float2 distv = P0 - proj; float2 orth = float2(-direction.y, direction.x); float side = sign(dot(P0, orth)); float v = side * length(distv * scale); // return step(0, v); return smoothstep(-sqrt(2.0)/2.0, sqrt(2.0)/2.0, v); } #define P(coord, xoffs, yoffs) SampleFromVRAM(texpage, clamp(coords + float2((xoffs), (yoffs)), uv_limits.xy, uv_limits.zw)) void FilteredSampleFromVRAM(uint4 texpage, float2 coords, float4 uv_limits, out float4 texcol, out float ialpha) { //--------------------------------------- // Input Pixel Mapping: -|x|x|x|- // x|A|B|C|x // x|D|E|F|x // x|G|H|I|x // -|x|x|x|- float2 scale = float2(8.0, 8.0); float2 pos = frac(coords.xy) - float2(0.5, 0.5); float2 coord = coords.xy - pos; float4 A = P(coord, -1,-1); float Aw = A.w; A.w = float(VECTOR_NEQ(A, TRANSPARENT_PIXEL_COLOR)); float4 B = P(coord, 0,-1); float Bw = B.w; B.w = float(VECTOR_NEQ(B, TRANSPARENT_PIXEL_COLOR)); float4 C = P(coord, 1,-1); float Cw = C.w; C.w = float(VECTOR_NEQ(C, TRANSPARENT_PIXEL_COLOR)); float4 D = P(coord, -1, 0); float Dw = D.w; D.w = float(VECTOR_NEQ(D, TRANSPARENT_PIXEL_COLOR)); float4 E = P(coord, 0, 0); float Ew = E.w; E.w = float(VECTOR_NEQ(E, TRANSPARENT_PIXEL_COLOR)); float4 F = P(coord, 1, 0); float Fw = F.w; F.w = float(VECTOR_NEQ(F, TRANSPARENT_PIXEL_COLOR)); float4 G = P(coord, -1, 1); float Gw = G.w; G.w = float(VECTOR_NEQ(G, TRANSPARENT_PIXEL_COLOR)); float4 H = P(coord, 0, 1); float Hw = H.w; H.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR)); float4 I = P(coord, 1, 1); float Iw = I.w; I.w = float(VECTOR_NEQ(H, TRANSPARENT_PIXEL_COLOR)); // blendResult Mapping: x|y| // w|z| int4 blendResult = int4(BLEND_NONE,BLEND_NONE,BLEND_NONE,BLEND_NONE); // Preprocess corners // Pixel Tap Mapping: -|-|-|-|- // -|-|B|C|- // -|D|E|F|x // -|G|H|I|x // -|-|x|x|- if (!((VECTOR_EQ(E,F) && VECTOR_EQ(H,I)) || 
(VECTOR_EQ(E,H) && VECTOR_EQ(F,I)))) { float dist_H_F = DistYCbCr(G, E) + DistYCbCr(E, C) + DistYCbCr(P(coord, 0,2), I) + DistYCbCr(I, P(coord, 2,0)) + (4.0 * DistYCbCr(H, F)); float dist_E_I = DistYCbCr(D, H) + DistYCbCr(H, P(coord, 1,2)) + DistYCbCr(B, F) + DistYCbCr(F, P(coord, 2,1)) + (4.0 * DistYCbCr(E, I)); bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_H_F) < dist_E_I; blendResult.z = ((dist_H_F < dist_E_I) && VECTOR_NEQ(E,F) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE; } // Pixel Tap Mapping: -|-|-|-|- // -|A|B|-|- // x|D|E|F|- // x|G|H|I|- // -|x|x|-|- if (!((VECTOR_EQ(D,E) && VECTOR_EQ(G,H)) || (VECTOR_EQ(D,G) && VECTOR_EQ(E,H)))) { float dist_G_E = DistYCbCr(P(coord, -2,1) , D) + DistYCbCr(D, B) + DistYCbCr(P(coord, -1,2), H) + DistYCbCr(H, F) + (4.0 * DistYCbCr(G, E)); float dist_D_H = DistYCbCr(P(coord, -2,0) , G) + DistYCbCr(G, P(coord, 0,2)) + DistYCbCr(A, E) + DistYCbCr(E, I) + (4.0 * DistYCbCr(D, H)); bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_H) < dist_G_E; blendResult.w = ((dist_G_E > dist_D_H) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,H)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE; } // Pixel Tap Mapping: -|-|x|x|- // -|A|B|C|x // -|D|E|F|x // -|-|H|I|- // -|-|-|-|- if (!((VECTOR_EQ(B,C) && VECTOR_EQ(E,F)) || (VECTOR_EQ(B,E) && VECTOR_EQ(C,F)))) { float dist_E_C = DistYCbCr(D, B) + DistYCbCr(B, P(coord, 1,-2)) + DistYCbCr(H, F) + DistYCbCr(F, P(coord, 2,-1)) + (4.0 * DistYCbCr(E, C)); float dist_B_F = DistYCbCr(A, E) + DistYCbCr(E, I) + DistYCbCr(P(coord, 0,-2), C) + DistYCbCr(C, P(coord, 2,0)) + (4.0 * DistYCbCr(B, F)); bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_B_F) < dist_E_C; blendResult.y = ((dist_E_C > dist_B_F) && VECTOR_NEQ(E,B) && VECTOR_NEQ(E,F)) ? ((dominantGradient) ? 
BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE; } // Pixel Tap Mapping: -|x|x|-|- // x|A|B|C|- // x|D|E|F|- // -|G|H|-|- // -|-|-|-|- if (!((VECTOR_EQ(A,B) && VECTOR_EQ(D,E)) || (VECTOR_EQ(A,D) && VECTOR_EQ(B,E)))) { float dist_D_B = DistYCbCr(P(coord, -2,0), A) + DistYCbCr(A, P(coord, 0,-2)) + DistYCbCr(G, E) + DistYCbCr(E, C) + (4.0 * DistYCbCr(D, B)); float dist_A_E = DistYCbCr(P(coord, -2,-1), D) + DistYCbCr(D, H) + DistYCbCr(P(coord, -1,-2), B) + DistYCbCr(B, F) + (4.0 * DistYCbCr(A, E)); bool dominantGradient = (DOMINANT_DIRECTION_THRESHOLD * dist_D_B) < dist_A_E; blendResult.x = ((dist_D_B < dist_A_E) && VECTOR_NEQ(E,D) && VECTOR_NEQ(E,B)) ? ((dominantGradient) ? BLEND_DOMINANT : BLEND_NORMAL) : BLEND_NONE; } float4 res = E; float resW = Ew; // Pixel Tap Mapping: -|-|-|-|- // -|-|B|C|- // -|D|E|F|x // -|G|H|I|x // -|-|x|x|- if(blendResult.z != BLEND_NONE) { float dist_F_G = DistYCbCr(F, G); float dist_H_C = DistYCbCr(H, C); bool doLineBlend = (blendResult.z == BLEND_DOMINANT || !((blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (IsPixEqual(G, H) && IsPixEqual(H, I) && IsPixEqual(I, F) && IsPixEqual(F, C) && !IsPixEqual(E, I)))); float2 origin = float2(0.0, 1.0 / sqrt(2.0)); float2 direction = float2(1.0, -1.0); if(doLineBlend) { bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_F_G <= dist_H_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(D,G); bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_H_C <= dist_F_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(B,C); origin = haveShallowLine? float2(0.0, 0.25) : float2(0.0, 0.5); direction.x += haveShallowLine? 1.0: 0.0; direction.y -= haveSteepLine? 
1.0: 0.0; } float4 blendPix = lerp(H,F, step(DistYCbCr(E, F), DistYCbCr(E, H))); float blendW = lerp(Hw,Fw, step(DistYCbCr(E, F), DistYCbCr(E, H))); res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale)); } // Pixel Tap Mapping: -|-|-|-|- // -|A|B|-|- // x|D|E|F|- // x|G|H|I|- // -|x|x|-|- if(blendResult.w != BLEND_NONE) { float dist_H_A = DistYCbCr(H, A); float dist_D_I = DistYCbCr(D, I); bool doLineBlend = (blendResult.w == BLEND_DOMINANT || !((blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (IsPixEqual(A, D) && IsPixEqual(D, G) && IsPixEqual(G, H) && IsPixEqual(H, I) && !IsPixEqual(E, G)))); float2 origin = float2(-1.0 / sqrt(2.0), 0.0); float2 direction = float2(1.0, 1.0); if(doLineBlend) { bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_H_A <= dist_D_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(B,A); bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_D_I <= dist_H_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(F,I); origin = haveShallowLine? float2(-0.25, 0.0) : float2(-0.5, 0.0); direction.y += haveShallowLine? 1.0: 0.0; direction.x += haveSteepLine? 
1.0: 0.0; } origin = origin; direction = direction; float4 blendPix = lerp(H,D, step(DistYCbCr(E, D), DistYCbCr(E, H))); float blendW = lerp(Hw,Dw, step(DistYCbCr(E, D), DistYCbCr(E, H))); res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale)); } // Pixel Tap Mapping: -|-|x|x|- // -|A|B|C|x // -|D|E|F|x // -|-|H|I|- // -|-|-|-|- if(blendResult.y != BLEND_NONE) { float dist_B_I = DistYCbCr(B, I); float dist_F_A = DistYCbCr(F, A); bool doLineBlend = (blendResult.y == BLEND_DOMINANT || !((blendResult.x != BLEND_NONE && !IsPixEqual(E, I)) || (blendResult.z != BLEND_NONE && !IsPixEqual(E, A)) || (IsPixEqual(I, F) && IsPixEqual(F, C) && IsPixEqual(C, B) && IsPixEqual(B, A) && !IsPixEqual(E, C)))); float2 origin = float2(1.0 / sqrt(2.0), 0.0); float2 direction = float2(-1.0, -1.0); if(doLineBlend) { bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_B_I <= dist_F_A) && VECTOR_NEQ(E,I) && VECTOR_NEQ(H,I); bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_F_A <= dist_B_I) && VECTOR_NEQ(E,A) && VECTOR_NEQ(D,A); origin = haveShallowLine? float2(0.25, 0.0) : float2(0.5, 0.0); direction.y -= haveShallowLine? 1.0: 0.0; direction.x -= haveSteepLine? 
1.0: 0.0; } float4 blendPix = lerp(F,B, step(DistYCbCr(E, B), DistYCbCr(E, F))); float blendW = lerp(Fw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, F))); res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale)); } // Pixel Tap Mapping: -|x|x|-|- // x|A|B|C|- // x|D|E|F|- // -|G|H|-|- // -|-|-|-|- if(blendResult.x != BLEND_NONE) { float dist_D_C = DistYCbCr(D, C); float dist_B_G = DistYCbCr(B, G); bool doLineBlend = (blendResult.x == BLEND_DOMINANT || !((blendResult.w != BLEND_NONE && !IsPixEqual(E, C)) || (blendResult.y != BLEND_NONE && !IsPixEqual(E, G)) || (IsPixEqual(C, B) && IsPixEqual(B, A) && IsPixEqual(A, D) && IsPixEqual(D, G) && !IsPixEqual(E, A)))); float2 origin = float2(0.0, -1.0 / sqrt(2.0)); float2 direction = float2(-1.0, 1.0); if(doLineBlend) { bool haveShallowLine = (STEEP_DIRECTION_THRESHOLD * dist_D_C <= dist_B_G) && VECTOR_NEQ(E,C) && VECTOR_NEQ(F,C); bool haveSteepLine = (STEEP_DIRECTION_THRESHOLD * dist_B_G <= dist_D_C) && VECTOR_NEQ(E,G) && VECTOR_NEQ(H,G); origin = haveShallowLine? float2(0.0, -0.25) : float2(0.0, -0.5); direction.x -= haveShallowLine? 1.0: 0.0; direction.y += haveSteepLine? 1.0: 0.0; } float4 blendPix = lerp(D,B, step(DistYCbCr(E, B), DistYCbCr(E, D))); float blendW = lerp(Dw,Bw, step(DistYCbCr(E, B), DistYCbCr(E, D))); res = lerp(res, blendPix, get_left_ratio(pos, origin, direction, scale)); resW = lerp(resW, blendW, get_left_ratio(pos, origin, direction, scale)); } ialpha = res.w; texcol = float4(res.xyz, resW); // Compensate for partially transparent sampling. if (ialpha > 0.0) texcol.rgb /= float3(ialpha, ialpha, ialpha); #if BINALPHA ialpha = (ialpha >= 0.5) ? 
1.0 : 0.0; #endif } #undef P )"; } } std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMode render_mode, GPUTransparencyMode transparency, GPUTextureMode texture_mode, bool dithering, bool interlacing, bool check_mask) { // TODO: don't write depth for shader blend DebugAssert(transparency == GPUTransparencyMode::Disabled || render_mode == GPU_HW::BatchRenderMode::ShaderBlend); const GPUTextureMode actual_texture_mode = texture_mode & ~GPUTextureMode::RawTextureBit; const bool raw_texture = (texture_mode & GPUTextureMode::RawTextureBit) == GPUTextureMode::RawTextureBit; const bool textured = (texture_mode != GPUTextureMode::Disabled); const bool shader_blending = (render_mode == GPU_HW::BatchRenderMode::ShaderBlend && (transparency != GPUTransparencyMode::Disabled || check_mask)); const bool use_dual_source = (!shader_blending && m_supports_dual_source_blend && ((render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled && render_mode != GPU_HW::BatchRenderMode::OnlyOpaque) || m_texture_filter != GPUTextureFilter::Nearest)); std::stringstream ss; WriteHeader(ss); DefineMacro(ss, "TRANSPARENCY", render_mode != GPU_HW::BatchRenderMode::TransparencyDisabled); DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", render_mode == GPU_HW::BatchRenderMode::OnlyOpaque); DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENT", render_mode == GPU_HW::BatchRenderMode::OnlyTransparent); DefineMacro(ss, "TRANSPARENCY_MODE", static_cast(transparency)); DefineMacro(ss, "SHADER_BLENDING", shader_blending); DefineMacro(ss, "CHECK_MASK_BIT", check_mask); DefineMacro(ss, "TEXTURED", textured); DefineMacro(ss, "PALETTE", actual_texture_mode == GPUTextureMode::Palette4Bit || actual_texture_mode == GPUTextureMode::Palette8Bit); DefineMacro(ss, "PALETTE_4_BIT", actual_texture_mode == GPUTextureMode::Palette4Bit); DefineMacro(ss, "PALETTE_8_BIT", actual_texture_mode == GPUTextureMode::Palette8Bit); DefineMacro(ss, "RAW_TEXTURE", raw_texture); DefineMacro(ss, "DITHERING", 
dithering); DefineMacro(ss, "DITHERING_SCALED", m_scaled_dithering); // Debanding requires true color to work correctly. DefineMacro(ss, "DEBANDING", m_true_color && m_debanding); DefineMacro(ss, "INTERLACING", interlacing); DefineMacro(ss, "TRUE_COLOR", m_true_color); DefineMacro(ss, "TEXTURE_FILTERING", m_texture_filter != GPUTextureFilter::Nearest); DefineMacro(ss, "UV_LIMITS", m_uv_limits); DefineMacro(ss, "USE_DUAL_SOURCE", use_dual_source); DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth); WriteCommonFunctions(ss); WriteBatchUniformBuffer(ss); DeclareTexture(ss, "samp0", 0); if (m_glsl) ss << "CONSTANT int[16] s_dither_values = int[16]( "; else ss << "CONSTANT int s_dither_values[] = {"; for (u32 i = 0; i < 16; i++) { if (i > 0) ss << ", "; ss << DITHER_MATRIX[i / 4][i % 4]; } if (m_glsl) ss << " );\n"; else ss << "};\n"; ss << R"( uint3 ApplyDithering(uint2 coord, uint3 icol) { #if DITHERING_SCALED uint2 fc = coord & uint2(3u, 3u); #else uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u); #endif int offset = s_dither_values[fc.y * 4u + fc.x]; #if !TRUE_COLOR return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31)); #else return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255)); #endif } #if TEXTURED CONSTANT float4 TRANSPARENT_PIXEL_COLOR = float4(0.0, 0.0, 0.0, 0.0); uint2 ApplyTextureWindow(uint2 coords) { uint x = (uint(coords.x) & u_texture_window_and.x) | u_texture_window_or.x; uint y = (uint(coords.y) & u_texture_window_and.y) | u_texture_window_or.y; return uint2(x, y); } uint2 ApplyUpscaledTextureWindow(uint2 coords) { uint2 native_coords = coords / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); uint2 coords_offset = coords % uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); return (ApplyTextureWindow(native_coords) * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) + coords_offset; } uint2 FloatToIntegerCoords(float2 coords) { // With the vertex offset applied at 1x resolution scale, we want 
to round the texture coordinates. // Floor them otherwise, as it currently breaks when upscaling as the vertex offset is not applied. return uint2((RESOLUTION_SCALE == 1u) ? roundEven(coords) : floor(coords)); } float4 SampleFromVRAM(uint4 texpage, float2 coords) { #if PALETTE uint2 icoord = ApplyTextureWindow(FloatToIntegerCoords(coords)); uint2 index_coord = icoord; #if PALETTE_4_BIT index_coord.x /= 4u; #elif PALETTE_8_BIT index_coord.x /= 2u; #endif // fixup coords uint2 vicoord = texpage.xy + (index_coord * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)); // load colour/palette float4 texel = LOAD_TEXTURE(samp0, int2(vicoord), 0); uint vram_value = RGBA8ToRGBA5551(texel); // apply palette #if PALETTE_4_BIT uint subpixel = icoord.x & 3u; uint palette_index = (vram_value >> (subpixel * 4u)) & 0x0Fu; uint2 palette_icoord = uint2((texpage.z + palette_index) * RESOLUTION_SCALE, texpage.w); #elif PALETTE_8_BIT // can only wrap in X direction for 8-bit, 4-bit will fit in texpage size. uint subpixel = icoord.x & 1u; uint palette_index = (vram_value >> (subpixel * 8u)) & 0xFFu; uint2 palette_icoord = uint2(((texpage.z + palette_index) & 0x3FFu) * RESOLUTION_SCALE, texpage.w); #endif return LOAD_TEXTURE(samp0, int2(palette_icoord), 0); #else // Direct texturing. Render-to-texture effects. Use upscaled coordinates. uint2 icoord = ApplyUpscaledTextureWindow(FloatToIntegerCoords(coords)); uint2 direct_icoord = texpage.xy + icoord; return LOAD_TEXTURE(samp0, int2(direct_icoord), 0); #endif } #endif // From https://alex.vlachos.com/graphics/Alex_Vlachos_Advanced_VR_Rendering_GDC2015.pdf // and https://www.shadertoy.com/view/MslGR8 (5th one starting from the bottom) // NOTE: `frag_coord` is in pixels (i.e. not normalized UV). float3 ApplyDebanding(float2 frag_coord) { #if DEBANDING // Iestyn's RGB dither (7 asm instructions) from Portal 2 X360, slightly modified for VR. 
float ditherc = dot(vec2(171.0, 231.0), frag_coord); float3 dither = float3(ditherc, ditherc, ditherc); dither = fract(dither / float3(103.0, 71.0, 97.0)); // Subtract 0.5 to avoid slightly brightening the whole viewport. return (dither - 0.5) / 255.0; #else return float3(0.0, 0.0, 0.0); #endif } )"; if (textured) { if (m_texture_filter != GPUTextureFilter::Nearest) WriteBatchTextureFilter(ss, m_texture_filter); if (m_uv_limits) { DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float4 v_uv_limits"}}, true, use_dual_source ? 2 : 1, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective, shader_blending); } else { DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, true, use_dual_source ? 2 : 1, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective, shader_blending); } } else { DeclareFragmentEntryPoint(ss, 1, 0, {}, true, use_dual_source ? 2 : 1, m_write_mask_as_depth, UsingMSAA(), UsingPerSampleShading(), false, m_disable_color_perspective, shader_blending); } ss << R"( { uint3 vertcol = uint3(v_col0.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy)); bool semitransparent; uint3 icolor; float ialpha; float oalpha; #if INTERLACING if ((uint(v_pos.y) & 1u) == u_interlaced_displayed_field) discard; #endif #if TEXTURED // We can't currently use upscaled coordinate for palettes because of how they're packed. // Not that it would be any benefit anyway, render-to-texture effects don't use palettes. float2 coords = v_tex0; #if PALETTE coords /= float2(RESOLUTION_SCALE, RESOLUTION_SCALE); #endif #if UV_LIMITS float4 uv_limits = v_uv_limits; #if !PALETTE // Extend the UV range to all "upscaled" pixels. This means 1-pixel-high polygon-based // framebuffer effects won't be downsampled. (e.g. 
Mega Man Legends 2 haze effect) uv_limits *= float(RESOLUTION_SCALE); uv_limits.zw += float(RESOLUTION_SCALE - 1u); #endif #endif float4 texcol; #if TEXTURE_FILTERING FilteredSampleFromVRAM(v_texpage, coords, uv_limits, texcol, ialpha); if (ialpha < 0.5) discard; #else #if UV_LIMITS texcol = SampleFromVRAM(v_texpage, clamp(coords, uv_limits.xy, uv_limits.zw)); #else texcol = SampleFromVRAM(v_texpage, coords); #endif if (VECTOR_EQ(texcol, TRANSPARENT_PIXEL_COLOR)) discard; ialpha = 1.0; #endif semitransparent = (texcol.a >= 0.5); // If not using true color, truncate the framebuffer colors to 5-bit. #if !TRUE_COLOR icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3; #if !RAW_TEXTURE icolor = (icolor * vertcol) >> 4; #if DITHERING icolor = ApplyDithering(uint2(v_pos.xy), icolor); #else icolor = min(icolor >> 3, uint3(31u, 31u, 31u)); #endif #endif #else icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0) + ApplyDebanding(v_pos.xy)); #if !RAW_TEXTURE icolor = (icolor * vertcol) >> 7; #if DITHERING icolor = ApplyDithering(uint2(v_pos.xy), icolor); #else icolor = min(icolor, uint3(255u, 255u, 255u)); #endif #endif #endif // Compute output alpha (mask bit) oalpha = float(u_set_mask_while_drawing ? 1 : int(semitransparent)); #else // All pixels are semitransparent for untextured polygons. semitransparent = true; icolor = vertcol; ialpha = 1.0; #if DITHERING icolor = ApplyDithering(uint2(v_pos.xy), icolor); #else #if !TRUE_COLOR icolor >>= 3; #endif #endif // However, the mask bit is cleared if set mask bit is false. oalpha = float(u_set_mask_while_drawing); #endif // Premultiply alpha so we don't need to use a colour output for it. float premultiply_alpha = ialpha; #if TRANSPARENCY && !SHADER_BLENDING premultiply_alpha = ialpha * (semitransparent ? 
u_src_alpha_factor : 1.0); #endif float3 color; #if !TRUE_COLOR // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color // into the blend unit, which can cause a small amount of error to accumulate. color = floor(float3(icolor) * premultiply_alpha) / float3(31.0, 31.0, 31.0); #else // True color is actually simpler here since we want to preserve the precision. color = (float3(icolor) * premultiply_alpha) / float3(255.0, 255.0, 255.0); #endif #if SHADER_BLENDING float4 bg_col = LAST_FRAG_COLOR; float4 fg_col = float4(color, oalpha); #if CHECK_MASK_BIT if (bg_col.a != 0.0) discard; #endif #if TEXTURE_FILTERING #if TRANSPARENCY_MODE == 0 || TRANSPARENCY_MODE == 3 bg_col.rgb /= ialpha; #endif fg_col.rgb *= ialpha; #endif o_col0.a = fg_col.a; #if TRANSPARENCY_MODE == 0 // Half BG + Half FG. o_col0.rgb = (bg_col.rgb * 0.5) + (fg_col.rgb * 0.5); #elif TRANSPARENCY_MODE == 1 // BG + FG o_col0.rgb = bg_col.rgb + fg_col.rgb; #elif TRANSPARENCY_MODE == 2 // BG - FG o_col0.rgb = bg_col.rgb - fg_col.rgb; #elif TRANSPARENCY_MODE == 3 // BG + 1/4 FG. o_col0.rgb = bg_col.rgb + (fg_col.rgb * 0.25); #else o_col0.rgb = fg_col.rgb; #endif #if TRANSPARENCY // If pixel isn't marked as semitransparent, replace with previous colour. o_col0 = semitransparent ? o_col0 : fg_col; #endif #elif TRANSPARENCY && TEXTURED // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored. 
if (semitransparent) { #if USE_DUAL_SOURCE o_col0 = float4(color, oalpha); o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha); #else o_col0 = float4(color, oalpha); #endif #if WRITE_MASK_AS_DEPTH o_depth = oalpha * v_pos.z; #endif #if TRANSPARENCY_ONLY_OPAQUE discard; #endif } else { #if USE_DUAL_SOURCE o_col0 = float4(color, oalpha); o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha); #else o_col0 = float4(color, oalpha); #endif #if WRITE_MASK_AS_DEPTH o_depth = oalpha * v_pos.z; #endif #if TRANSPARENCY_ONLY_TRANSPARENT discard; #endif } #elif TRANSPARENCY // We shouldn't be rendering opaque geometry only when untextured, so no need to test/discard here. #if USE_DUAL_SOURCE o_col0 = float4(color, oalpha); o_col1 = float4(0.0, 0.0, 0.0, u_dst_alpha_factor / ialpha); #else o_col0 = float4(color, oalpha); #endif #if WRITE_MASK_AS_DEPTH o_depth = oalpha * v_pos.z; #endif #else // Non-transparency won't enable blending so we can write the mask here regardless. o_col0 = float4(color, oalpha); #if USE_DUAL_SOURCE o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha); #endif #if WRITE_MASK_AS_DEPTH o_depth = oalpha * v_pos.z; #endif #endif } )"; return ss.str(); }

// Builds the fragment shader that reads pixels out of the VRAM texture starting at u_vram_offset.
//
// depth_24bit: when true the shader takes the DEPTH_24BIT path, which rebuilds each output pixel from
//              two adjacent 16-bit VRAM texels (SampleVRAM24). Otherwise the RGB of the texel at the
//              wrapped VRAM coordinate is copied through.
// Returns the complete shader source. Output alpha is always 1.0.
std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(bool depth_24bit)
{
  std::stringstream ss;
  WriteHeader(ss);
  DefineMacro(ss, "DEPTH_24BIT", depth_24bit);

  // NOTE(review): nothing in this shader tests MULTISAMPLED; LoadVRAM() below keys off MULTISAMPLING,
  // which WriteCommonFunctions() defines. Kept so the emitted source is unchanged.
  DefineMacro(ss, "MULTISAMPLED", UsingMSAA());

  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_skip_x", "uint u_line_skip"}, true);
  DeclareTexture(ss, "samp0", 0, UsingMSAA());

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

float3 SampleVRAM24(uint2 icoords)
{
  // load adjacent 16-bit texels
  uint2 clamp_size = uint2(1024, 512);

  // relative to start of scanout
  uint2 vram_coords = u_vram_offset + uint2((icoords.x * 3u) / 2u, icoords.y);
  uint s0 = RGBA8ToRGBA5551(LoadVRAM(int2((vram_coords % clamp_size) * RESOLUTION_SCALE)));
  uint s1 = RGBA8ToRGBA5551(LoadVRAM(int2(((vram_coords + uint2(1, 0)) % clamp_size) * RESOLUTION_SCALE)));

  // select which part of the combined 16-bit texels we are currently shading
  uint s1s0 = ((s1 << 16) | s0) >> ((icoords.x & 1u) * 8u);

  // extract components and normalize
  return float3(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
                float((s1s0 >> 16u) & 0xFFu) / 255.0);
}
)";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
  ss << R"(
{
  uint2 icoords = uint2(uint(v_pos.x) + u_skip_x, uint(v_pos.y) << u_line_skip);

#if DEPTH_24BIT
  o_col0 = float4(SampleVRAM24(icoords), 1.0);
#else
  o_col0 = float4(LoadVRAM(int2((icoords + u_vram_offset) % VRAM_SIZE)).rgb, 1.0);
#endif
}
)";

  return ss.str();
}

// Builds the geometry shader used for wireframe rendering: every input triangle is re-emitted as
// three separate two-vertex line strips (edges 0-1, 1-2 and 2-0). Only the position is forwarded.
// The GLSL variant is chosen when m_glsl is set, otherwise the HLSL variant is emitted.
std::string GPU_HW_ShaderGen::GenerateWireframeGeometryShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);

  if (m_glsl)
  {
    ss << R"(
layout(triangles) in;
layout(line_strip, max_vertices = 6) out;

void main()
{
  gl_Position = gl_in[0].gl_Position;
  EmitVertex();
  gl_Position = gl_in[1].gl_Position;
  EmitVertex();
  EndPrimitive();
  gl_Position = gl_in[1].gl_Position;
  EmitVertex();
  gl_Position = gl_in[2].gl_Position;
  EmitVertex();
  EndPrimitive();
  gl_Position = gl_in[2].gl_Position;
  EmitVertex();
  gl_Position = gl_in[0].gl_Position;
  EmitVertex();
  EndPrimitive();
}
)";
  }
  else
  {
    // HLSL stream-output objects are templated on the vertex type they receive, so the stream must be
    // declared as LineStream<GSOutput>.
    ss << R"(
struct GSInput
{
  float4 col0 : COLOR0;
  float4 pos : SV_Position;
};

struct GSOutput
{
  float4 pos : SV_Position;
};

GSOutput GetVertex(GSInput vi)
{
  GSOutput vo;
  vo.pos = vi.pos;
  return vo;
}

[maxvertexcount(6)]
void main(triangle GSInput input[3], inout LineStream<GSOutput> output)
{
  output.Append(GetVertex(input[0]));
  output.Append(GetVertex(input[1]));
  output.RestartStrip();

  output.Append(GetVertex(input[1]));
  output.Append(GetVertex(input[2]));
  output.RestartStrip();

  output.Append(GetVertex(input[2]));
  output.Append(GetVertex(input[0]));
  output.RestartStrip();
}
)";
  }

  return ss.str();
}

// Builds the fragment shader paired with the wireframe geometry shader. Every fragment is written as
// white with an alpha of 0.5.
std::string GPU_HW_ShaderGen::GenerateWireframeFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);

  DeclareFragmentEntryPoint(ss, 0, 0, {}, false, 1);
  ss << R"(
{
  o_col0 = float4(1.0, 1.0, 1.0, 0.5);
}
)";

  return ss.str();
}

// Builds the fragment shader used to read VRAM back as packed 16-bit pixels.
// Each output pixel carries two horizontally adjacent RGBA5551 values (left in the R/G bytes, right in
// the B/A bytes), so the render target is half as wide as the region being read. When
// RESOLUTION_SCALE is greater than one, each native pixel is box-filtered from the upscaled texels
// before being converted to RGBA5551.
std::string GPU_HW_ShaderGen::GenerateVRAMReadFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size"}, true);
  DeclareTexture(ss, "samp0", 0, UsingMSAA());

  ss << R"(
float4 LoadVRAM(int2 coords)
{
#if MULTISAMPLING
  float4 value = LOAD_TEXTURE_MS(samp0, coords, 0u);
  FOR_UNROLL (uint sample_index = 1u; sample_index < MULTISAMPLES; sample_index++)
    value += LOAD_TEXTURE_MS(samp0, coords, sample_index);
  value /= float(MULTISAMPLES);
  return value;
#else
  return LOAD_TEXTURE(samp0, coords, 0);
#endif
}

uint SampleVRAM(uint2 coords)
{
  if (RESOLUTION_SCALE == 1u)
    return RGBA8ToRGBA5551(LoadVRAM(int2(coords)));

  // Box filter for downsampling.
  float4 value = float4(0.0, 0.0, 0.0, 0.0);
  uint2 base_coords = coords * uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);
  for (uint offset_x = 0u; offset_x < RESOLUTION_SCALE; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < RESOLUTION_SCALE; offset_y++)
      value += LoadVRAM(int2(base_coords + uint2(offset_x, offset_y)));
  }
  value /= float(RESOLUTION_SCALE * RESOLUTION_SCALE);
  return RGBA8ToRGBA5551(value);
}
)";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1);
  ss << R"(
{
  uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y));
  sample_coords += u_base_coords;

  // We're encoding as 32-bit, so the output width is halved and we pack two 16-bit pixels in one 32-bit pixel.
  uint left = SampleVRAM(sample_coords);
  uint right = SampleVRAM(uint2(sample_coords.x + 1u, sample_coords.y));

  o_col0 = float4(float(left & 0xFFu), float((left >> 8) & 0xFFu),
                  float(right & 0xFFu), float((right >> 8) & 0xFFu))
            / float4(255.0, 255.0, 255.0, 255.0);
})";

  return ss.str();
}

// Builds the fragment shader that writes uploaded 16-bit pixel data into the VRAM render target.
//
// use_buffer: false -> pixels are fetched from a texture bound as samp0;
//             true  -> pixels are fetched through GET_VALUE() and OR'ed with u_mask_or_bits.
// use_ssbo:   only consulted when use_buffer is true and GLSL is being generated; GET_VALUE() then reads
//             16-bit halves out of a std430 uint array instead of a texture buffer.
// Returns the complete shader source. When m_write_mask_as_depth is set the shader also writes
// u_depth_value (mask bit set) or 0.0 (mask bit clear) to o_depth.
std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DefineMacro(ss, "USE_BUFFER", use_buffer);
  DeclareUniformBuffer(ss,
                       {"uint2 u_base_coords", "uint2 u_end_coords", "uint2 u_size", "uint u_buffer_base_offset",
                        "uint u_mask_or_bits", "float u_depth_value"},
                       true);

  if (!use_buffer)
  {
    DeclareTexture(ss, "samp0", 0, false, true, true);
  }
  else if (use_ssbo && m_glsl)
  {
    // The binding qualifier differs per backend; plain GL only gets one when binding layouts are in use.
    ss << "layout(std430";
    if (IsVulkan())
      ss << ", set = 0, binding = 0";
    else if (IsMetal())
      ss << ", set = 0, binding = 1";
    else if (m_use_glsl_binding_layout)
      ss << ", binding = 0";

    ss << ") readonly restrict buffer SSBO {\n";
    ss << " uint ssbo_data[];\n";
    ss << "};\n\n";

    // Two 16-bit pixels are packed per uint: even offsets live in the low half, odd ones in the high half.
    ss << "#define GET_VALUE(buffer_offset) (ssbo_data[(buffer_offset) / 2u] >> (((buffer_offset) % 2u) * 16u))\n\n";
  }
  else
  {
    DeclareTextureBuffer(ss, "samp0", 0, true, true);
    ss << "#define GET_VALUE(buffer_offset) (LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r)\n\n";
  }

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, m_write_mask_as_depth);
  ss << R"(
{
  uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE);

  // make sure it's not oversized and out of range
  if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) ||
      (coords.y < u_base_coords.y && coords.y >= u_end_coords.y))
  {
    discard;
  }

  // find offset from the start of the row/column
  uint2 offset;
  offset.x = (coords.x < u_base_coords.x) ? ((VRAM_SIZE.x / RESOLUTION_SCALE) - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x);
  offset.y = (coords.y < u_base_coords.y) ? ((VRAM_SIZE.y / RESOLUTION_SCALE) - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y);

#if !USE_BUFFER
  uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x;
#else
  uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x;
  uint value = GET_VALUE(buffer_offset) | u_mask_or_bits;
#endif

  o_col0 = RGBA5551ToRGBA8(value);
#if WRITE_MASK_AS_DEPTH
  o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0;
#endif
})";

  return ss.str();
}

// Builds the fragment shader for VRAM-to-VRAM copies. For every destination pixel it computes the offset
// from u_dst_coords (accounting for wrap-around at the VRAM edges), loads the matching source texel from
// u_src_coords + offset (modulo VRAM_SIZE), and forces alpha to 1.0 when u_set_mask_bit is set.
// When m_write_mask_as_depth is set, o_depth is written as well.
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
{
  // TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer.
  const bool msaa = false;

  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DeclareUniformBuffer(ss,
                       {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_end_coords", "uint2 u_size",
                        "bool u_set_mask_bit", "float u_depth_value"},
                       true);

  DeclareTexture(ss, "samp0", 0, msaa);
  DefineMacro(ss, "MSAA_COPY", msaa);
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, m_write_mask_as_depth, false, false, msaa);
  ss << R"(
{
  uint2 dst_coords = uint2(v_pos.xy);

  // make sure it's not oversized and out of range
  if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
      (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
  {
    discard;
  }

  // find offset from the start of the row/column
  uint2 offset;
  offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
  offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);

  // find the source coordinates to copy from
  uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE;

  // sample and apply mask bit
#if MSAA_COPY
  float4 color = LOAD_TEXTURE_MS(samp0, int2(src_coords), f_sample_index);
#else
  float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
#endif
  o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
#if WRITE_MASK_AS_DEPTH
  o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0));
#endif
})";

  return ss.str();
}

// Builds the fragment shader for VRAM fills: every surviving fragment is written as u_fill_color.
//
// wrapped:    adds the range test that discards fragments outside a fill which wraps around VRAM.
// interlaced: discards fragments on lines whose parity equals u_interlaced_displayed_field.
// The fragment position is only declared as an input when one of the two options needs it.
std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DefineMacro(ss, "WRITE_MASK_AS_DEPTH", m_write_mask_as_depth);
  DefineMacro(ss, "WRAPPED", wrapped);
  DefineMacro(ss, "INTERLACED", interlaced);
  DeclareUniformBuffer(
    ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"},
    true);

  DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1, m_write_mask_as_depth, false, false, false);
  ss << R"(
{
#if INTERLACED || WRAPPED
  uint2 dst_coords = uint2(v_pos.xy);
#endif

#if INTERLACED
  if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
    discard;
#endif

#if WRAPPED
  // make sure it's not oversized and out of range
  if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
      (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
  {
    discard;
  }
#endif

  o_col0 = u_fill_color;
#if WRITE_MASK_AS_DEPTH
  o_depth = u_fill_color.a;
#endif
})";

  return ss.str();
}

// Builds the fragment shader that rebuilds the depth buffer from VRAM: the alpha channel of the texel
// under each fragment is written to o_depth. No colour output is declared.
std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareTexture(ss, "samp0", 0, UsingMSAA());
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, true, false, false, UsingMSAA());

  ss << R"(
{
#if MULTISAMPLING
  o_depth = LOAD_TEXTURE_MS(samp0, int2(v_pos.xy), f_sample_index).a;
#else
  o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a;
#endif
}
)";

  return ss.str();
}

// Declares the uniform buffer shared by the adaptive-downsample vertex, mip and blur shaders.
void GPU_HW_ShaderGen::WriteAdaptiveDownsampleUniformBuffer(std::stringstream& ss)
{
  DeclareUniformBuffer(ss, {"float2 u_uv_min", "float2 u_uv_max", "float2 u_rcp_resolution", "float u_lod"}, true);
}

// Builds the vertex shader for the adaptive-downsample passes. Positions and texture coordinates are
// derived from the vertex id alone; the texture coordinate is then remapped into [u_uv_min, u_uv_max].
// Y is flipped for the OpenGL, OpenGL ES and Vulkan backends.
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleVertexShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareVertexEntryPoint(ss, {}, 0, 1, {}, true);
  ss << R"(
{
  v_tex0 = float2(float((v_id << 1) & 2u), float(v_id & 2u));
  v_pos = float4(v_tex0 * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f), 0.0f, 1.0f);
  v_tex0 = u_uv_min + (u_uv_max - u_uv_min) * v_tex0;
#if API_OPENGL || API_OPENGL_ES || API_VULKAN
  v_pos.y = -v_pos.y;
#endif
}
)";

  return ss.str();
}

// Builds the fragment shader that produces one mip level for adaptive downsampling.
// Each output texel averages a 2x2 footprint of level u_lod and stores a bias term in alpha that is
// high where the four samples agree and falls off as their variance grows.
//
// first_pass: true  -> only RGB is sampled and the bias comes from colour variance alone;
//             false -> RGBA is sampled and the bias is additionally scaled by the mean of the four
//                      incoming alpha values.
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleMipFragmentShader(bool first_pass)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0, false);
  DefineMacro(ss, "FIRST_PASS", first_pass);

  // mipmap_energy.glsl ported from parallel-rsx.
  ss << R"(
float4 get_bias(float3 c00, float3 c01, float3 c10, float3 c11)
{
  // Measure the "energy" (variance) in the pixels.
  // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
  float3 avg = 0.25 * (c00 + c01 + c10 + c11);
  float s00 = dot(c00 - avg, c00 - avg);
  float s01 = dot(c01 - avg, c01 - avg);
  float s10 = dot(c10 - avg, c10 - avg);
  float s11 = dot(c11 - avg, c11 - avg);
  return float4(avg, 1.0 - log2(1000.0 * (s00 + s01 + s10 + s11) + 1.0));
}

float4 get_bias(float4 c00, float4 c01, float4 c10, float4 c11)
{
  // Measure the "energy" (variance) in the pixels.
  // If the pixels are all the same (2D content), use maximum bias, otherwise, taper off quickly back to 0 (edges)
  float avg = 0.25 * (c00.a + c01.a + c10.a + c11.a);
  float4 bias = get_bias(c00.rgb, c01.rgb, c10.rgb, c11.rgb);
  bias.a *= avg;
  return bias;
}
)";

  // FIRST_PASS is tested by value, like every other DefineMacro() switch in this file. An #ifdef would
  // take the first-pass branch whenever the macro is defined at all, regardless of its value.
  DeclareFragmentEntryPoint(ss, 0, 1, {}, false, 1, false, false, false, false);
  ss << R"(
{
  float2 uv = v_tex0 - (u_rcp_resolution * 0.25);
#if FIRST_PASS
  vec3 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0)).rgb;
  vec3 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1)).rgb;
  vec3 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0)).rgb;
  vec3 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1)).rgb;
  o_col0 = get_bias(c00, c01, c10, c11);
#else
  vec4 c00 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 0));
  vec4 c01 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(0, 1));
  vec4 c10 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 0));
  vec4 c11 = SAMPLE_TEXTURE_LEVEL_OFFSET(samp0, uv, u_lod, int2(1, 1));
  o_col0 = get_bias(c00, c01, c10, c11);
#endif
}
)";

  return ss.str();
}

// Builds the fragment shader that blurs the bias (alpha) channel with a 3x3 kernel whose weights are
// 0.25 (centre), 0.125 (edges) and 0.0625 (corners). Sample positions are clamped to
// [u_uv_min, u_uv_max]. The blurred value is replicated into all four output channels.
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleBlurFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  WriteAdaptiveDownsampleUniformBuffer(ss);
  DeclareTexture(ss, "samp0", 0, false);

  // mipmap_blur.glsl ported from parallel-rsx.
  DeclareFragmentEntryPoint(ss, 0, 1, {}, false, 1, false, false, false, false);
  ss << R"(
{
  float bias = 0.0;
  const float w0 = 0.25;
  const float w1 = 0.125;
  const float w2 = 0.0625;
#define UV(x, y) clamp((v_tex0 + float2(x, y) * u_rcp_resolution), u_uv_min, u_uv_max)
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, -1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, -1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(-1.0, +1.0)).a;
  bias += w2 * SAMPLE_TEXTURE(samp0, UV(+1.0, +1.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, -1.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV(-1.0, 0.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV(+1.0, 0.0)).a;
  bias += w1 * SAMPLE_TEXTURE(samp0, UV( 0.0, +1.0)).a;
  bias += w0 * SAMPLE_TEXTURE(samp0, UV( 0.0, 0.0)).a;
  o_col0 = float4(bias, bias, bias, bias);
}
)";

  return ss.str();
}

// Builds the fragment shader for the final adaptive-downsample pass. The bias read from samp1 is scaled
// by (RESOLUTION_SCALE - 1) to pick the mip level of samp0 that is sampled for the output colour.
// Output alpha is always 1.0.
std::string GPU_HW_ShaderGen::GenerateAdaptiveDownsampleCompositeFragmentShader()
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareTexture(ss, "samp0", 0, false);
  DeclareTexture(ss, "samp1", 1, false);

  // mipmap_resolve.glsl ported from parallel-rsx.
  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, false, false, false);
  ss << R"(
{
  float bias = SAMPLE_TEXTURE(samp1, v_tex0).r;
  float mip = float(RESOLUTION_SCALE - 1u) * bias;
  float3 color = SAMPLE_TEXTURE_LEVEL(samp0, v_tex0, mip).rgb;
  o_col0 = float4(color, 1.0);
}
)";

  return ss.str();
}

// Builds the fragment shader that downsamples by averaging a factor x factor box of texels.
//
// factor: edge length of the box, baked into the shader as the FACTOR macro.
// Returns the complete shader source. Output alpha is always 1.0.
std::string GPU_HW_ShaderGen::GenerateBoxSampleDownsampleFragmentShader(u32 factor)
{
  std::stringstream ss;
  WriteHeader(ss);
  WriteCommonFunctions(ss);
  DeclareUniformBuffer(ss, {"uint2 u_base_coords"}, true);
  DeclareTexture(ss, "samp0", 0, false);

  // Emit FACTOR as an unsigned literal: it is compared against uint loop counters below, and GLSL
  // versions without implicit int-to-uint conversion reject a mixed signed/unsigned comparison.
  ss << "#define FACTOR " << factor << "u\n";

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, false, false, false);
  ss << R"(
{
  float3 color = float3(0.0, 0.0, 0.0);
  uint2 base_coords = u_base_coords + uint2(v_pos.xy) * uint2(FACTOR, FACTOR);
  for (uint offset_x = 0u; offset_x < FACTOR; offset_x++)
  {
    for (uint offset_y = 0u; offset_y < FACTOR; offset_y++)
      color += LOAD_TEXTURE(samp0, int2(base_coords + uint2(offset_x, offset_y)), 0).rgb;
  }
  color /= float(FACTOR * FACTOR);
  o_col0 = float4(color, 1.0);
}
)";

  return ss.str();
}