Add a new port of crt-royale.fx (#3260)

- A new port of crt-royale. More faithful to original. It uses the same mask textures.
- The only thing not ported is the original geometry pass. It was replaced by geom curvature code.
- It's configured for 1080p displays. 4k displays need to adjust param mask_triad_size_desired from 3.0 to 4.0.

OBS: It's up to you decide if the two versions should be maintained.
This commit is contained in:
Hyllian 2024-07-26 01:33:01 -03:00 committed by GitHub
parent e455a5e371
commit bf1b023f12
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 8404 additions and 2 deletions

View file

@ -0,0 +1,494 @@
#include "ReShade.fxh"
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
// Ported to Duckstation (ReShade specs) by Hyllian (2024).
// Set shader params for all passes here:
uniform float crt_gamma <
ui_type = "drag";
ui_min = 1.0;
ui_max = 5.0;
ui_step = 0.025;
ui_label = "Simulated CRT Gamma";
> = 2.5;
uniform float lcd_gamma <
ui_type = "drag";
ui_min = 1.0;
ui_max = 5.0;
ui_step = 0.025;
ui_label = "Your Display Gamma";
> = 2.2;
uniform float levels_contrast <
ui_type = "drag";
ui_min = 0.0;
ui_max = 4.0;
ui_step = 0.015625;
ui_label = "Contrast";
> = 1.0;
uniform float halation_weight <
ui_type = "drag";
ui_min = 0.0;
ui_max = 1.0;
ui_step = 0.005;
ui_label = "Halation Weight";
> = 0.0;
uniform float diffusion_weight <
ui_type = "drag";
ui_min = 0.0;
ui_max = 1.0;
ui_step = 0.005;
ui_label = "Diffusion Weight";
> = 0.075;
uniform float bloom_underestimate_levels <
ui_type = "drag";
ui_min = 0.0;
ui_max = 5.0;
ui_step = 0.01;
ui_label = "Bloom - Underestimate Levels";
> = 0.8;
uniform float bloom_excess <
ui_type = "drag";
ui_min = 0.0;
ui_max = 1.0;
ui_step = 0.005;
ui_label = "Bloom - Excess";
> = 0.0;
uniform float beam_min_sigma <
ui_type = "drag";
ui_min = 0.005;
ui_max = 1.0;
ui_step = 0.005;
ui_label = "Beam - Min Sigma";
> = 0.02;
uniform float beam_max_sigma <
ui_type = "drag";
ui_min = 0.005;
ui_max = 1.0;
ui_step = 0.005;
ui_label = "Beam - Max Sigma";
> = 0.3;
uniform float beam_spot_power <
ui_type = "drag";
ui_min = 0.01;
ui_max = 16.0;
ui_step = 0.01;
ui_label = "Beam - Spot Power";
> = 0.33;
uniform float beam_min_shape <
ui_type = "drag";
ui_min = 2.0;
ui_max = 32.0;
ui_step = 0.1;
ui_label = "Beam - Min Shape";
> = 2.0;
uniform float beam_max_shape <
ui_type = "drag";
ui_min = 2.0;
ui_max = 32.0;
ui_step = 0.1;
ui_label = "Beam - Max Shape";
> = 4.0;
uniform float beam_shape_power <
ui_type = "drag";
ui_min = 0.01;
ui_max = 16.0;
ui_step = 0.01;
ui_label = "Beam - Shape Power";
> = 0.25;
uniform int beam_horiz_filter <
ui_type = "combo";
ui_items = "Quilez (fast)\0Gaussian (slow)\0Lanczos (medium)\0";
ui_label = "Beam - Horiz Filter";
> = 0;
uniform float beam_horiz_sigma <
ui_type = "drag";
ui_min = 0.0;
ui_max = 0.67;
ui_step = 0.005;
ui_label = "Beam - Horiz Sigma";
> = 0.35;
uniform float beam_horiz_linear_rgb_weight <
ui_type = "drag";
ui_min = 0.0;
ui_max = 1.0;
ui_step = 0.01;
ui_label = "Beam - Horiz Linear RGB Weight";
> = 1.0;
uniform float convergence_offset_x_r <
ui_type = "drag";
ui_min = -4.0;
ui_max = 4.0;
ui_step = 0.05;
ui_label = "Convergence - Offset X Red";
> = 0.0;
uniform float convergence_offset_x_g <
ui_type = "drag";
ui_min = -4.0;
ui_max = 4.0;
ui_step = 0.05;
ui_label = "Convergence - Offset X Green";
> = 0.0;
uniform float convergence_offset_x_b <
ui_type = "drag";
ui_min = -4.0;
ui_max = 4.0;
ui_step = 0.05;
ui_label = "Convergence - Offset X Blue";
> = 0.0;
uniform float convergence_offset_y_r <
ui_type = "drag";
ui_min = -2.0;
ui_max = 2.0;
ui_step = 0.05;
ui_label = "Convergence - Offset Y Red";
> = 0.0;
uniform float convergence_offset_y_g <
ui_type = "drag";
ui_min = -2.0;
ui_max = 2.0;
ui_step = 0.05;
ui_label = "Convergence - Offset Y Green";
> = 0.0;
uniform float convergence_offset_y_b <
ui_type = "drag";
ui_min = -2.0;
ui_max = 2.0;
ui_step = 0.05;
ui_label = "Convergence - Offset Y Blue";
> = 0.0;
uniform int mask_type <
ui_type = "combo";
ui_items = "Aperture Grille\0Slot Mask\0Shadow Mask\0";
ui_label = "Mask - Type";
> = 0;
uniform float mask_sample_mode_desired <
ui_type = "drag";
ui_min = 0.0;
ui_max = 2.0;
ui_step = 1.;
ui_label = "Mask - Sample Mode";
> = 0.0;
uniform float mask_specify_num_triads <
ui_type = "drag";
ui_min = 0.0;
ui_max = 1.0;
ui_step = 1.0;
ui_label = "Mask - Specify Number of Triads";
> = 0.0;
uniform float mask_triad_size_desired <
ui_type = "drag";
ui_min = 1.0;
ui_max = 18.0;
ui_step = 0.125;
ui_label = "Mask - Triad Size Desired";
> = 3.0;
uniform float mask_num_triads_desired <
ui_type = "drag";
ui_min = 342.0;
ui_max = 1920.0;
ui_step = 1.0;
ui_label = "Mask - Number of Triads Desired";
> = 480.0;
uniform bool interlace_detect <
ui_type = "radio";
ui_label = "Enable Interlacing Detection";
ui_category = "Interlacing";
> = true;
uniform bool interlace_bff <
ui_type = "radio";
ui_label = "Bottom Field First";
ui_category = "Interlacing";
> = false;
uniform bool interlace_1080i <
ui_type = "radio";
ui_label = "Detect 1080i";
ui_category = "Interlacing";
> = false;
uniform float FrameCount < source = "framecount"; >;
uniform float2 BufferToViewportRatio < source = "buffer_to_viewport_ratio"; >;
uniform float2 InternalPixelSize < source = "internal_pixel_size"; >;
uniform float2 NativePixelSize < source = "native_pixel_size"; >;
uniform float2 NormalizedInternalPixelSize < source = "normalized_internal_pixel_size"; >;
uniform float2 NormalizedNativePixelSize < source = "normalized_native_pixel_size"; >;
uniform float UpscaleMultiplier < source = "upscale_multiplier"; >;
uniform float2 ViewportSize < source = "viewportsize"; >;
uniform float ViewportWidth < source = "viewportwidth"; >;
uniform float ViewportHeight < source = "viewportheight"; >;
#include "../misc/include/geom.fxh"
#define VIEWPORT_SIZE (ViewportSize*BufferToViewportRatio)
#define TEXTURE_SIZE (1.0/NormalizedNativePixelSize)
#define ORIG_LINEARIZED_texture_size TEXTURE_SIZE
#define VERTICAL_SCANLINES_texture_size TEXTURE_SIZE
#define BLOOM_APPROX_texture_size TEXTURE_SIZE
#define BLUR9FAST_VERTICAL_texture_size TEXTURE_SIZE
#define HALATION_BLUR_texture_size TEXTURE_SIZE
#define MASK_RESIZE_VERT_texture_size TEXTURE_SIZE
#define MASK_RESIZE_texture_size float2(64.0,0.0625*((VIEWPORT_SIZE).y))
#define MASKED_SCANLINES_texture_size (0.0625*VIEWPORT_SIZE)
#define BRIGHTPASS_texture_size VIEWPORT_SIZE
#define BLOOM_VERTICAL_texture_size VIEWPORT_SIZE
#define BLOOM_HORIZONTAL_texture_size VIEWPORT_SIZE
#define ORIG_LINEARIZED_video_size ORIG_LINEARIZED_texture_size
#define VERTICAL_SCANLINES_video_size VERTICAL_SCANLINES_texture_size
#define BLOOM_APPROX_video_size BLOOM_APPROX_texture_size
#define BLUR9FAST_VERTICAL_video_size BLUR9FAST_VERTICAL_texture_size
#define HALATION_BLUR_video_size HALATION_BLUR_texture_size
#define MASK_RESIZE_VERT_video_size MASK_RESIZE_VERT_texture_size
#define MASK_RESIZE_video_size MASK_RESIZE_texture_size
#define MASKED_SCANLINES_video_size MASKED_SCANLINES_texture_size
#define BRIGHTPASS_video_size BRIGHTPASS_texture_size
#define BLOOM_VERTICAL_video_size BLOOM_VERTICAL_texture_size
#define BLOOM_HORIZONTAL_video_size BLOOM_HORIZONTAL_texture_size
#define video_size texture_size
texture2D tmask_grille_texture_small < source = "crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png"; > {Width=64.0;Height=64.0;MipLevels=0;};
texture2D tmask_slot_texture_small < source = "crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png"; > {Width=64.0;Height=64.0;MipLevels=0;};
texture2D tmask_shadow_texture_small < source = "crt-royale/TileableLinearShadowMaskEDPResizeTo64.png"; > {Width=64.0;Height=64.0;MipLevels=0;};
texture2D tmask_grille_texture_large < source = "crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png"; > {Width=512.0;Height=512.0;MipLevels=4;};
texture2D tmask_slot_texture_large < source = "crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png"; > {Width=512.0;Height=512.0;MipLevels=4;};
texture2D tmask_shadow_texture_large < source = "crt-royale/TileableLinearShadowMaskEDP.png"; > {Width=512.0;Height=512.0;MipLevels=4;};
sampler2D mask_grille_texture_small { Texture = tmask_grille_texture_small; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
sampler2D mask_slot_texture_small { Texture = tmask_slot_texture_small; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
sampler2D mask_shadow_texture_small { Texture = tmask_shadow_texture_small; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
sampler2D mask_grille_texture_large { Texture = tmask_grille_texture_large; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
sampler2D mask_slot_texture_large { Texture = tmask_slot_texture_large; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
sampler2D mask_shadow_texture_large { Texture = tmask_shadow_texture_large; AddressU = REPEAT; AddressV = REPEAT; MinFilter = POINT; MagFilter = POINT;};
#ifndef DEBUG_PASSES
#define DEBUG_PASSES 11
#endif
texture2D tORIG_LINEARIZED{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D ORIG_LINEARIZED{Texture=tORIG_LINEARIZED;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#if (DEBUG_PASSES > 1)
texture2D tVERTICAL_SCANLINES{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D VERTICAL_SCANLINES{Texture=tVERTICAL_SCANLINES;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 2)
texture2D tBLOOM_APPROX{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D BLOOM_APPROX{Texture=tBLOOM_APPROX;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 3)
// Need checking if it's really necessary to rendertarget.
texture2D tBLUR9FAST_VERTICAL{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D BLUR9FAST_VERTICAL{Texture=tBLUR9FAST_VERTICAL;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 4)
texture2D tHALATION_BLUR{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D HALATION_BLUR{Texture=tHALATION_BLUR;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 5)
texture2D tMASK_RESIZE_VERTICAL{Width=64.0;Height=BUFFER_HEIGHT*0.0625;Format=RGBA8;};
sampler2D MASK_RESIZE_VERTICAL{Texture=tMASK_RESIZE_VERTICAL;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=POINT;MinFilter=POINT;};
#endif
#if (DEBUG_PASSES > 6)
texture2D tMASK_RESIZE{Width=BUFFER_WIDTH*0.0625;Height=BUFFER_HEIGHT*0.0625;Format=RGBA8;};
sampler2D MASK_RESIZE{Texture=tMASK_RESIZE;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=POINT;MinFilter=POINT;};
#endif
#if (DEBUG_PASSES > 7)
texture2D tMASKED_SCANLINES{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D MASKED_SCANLINES{Texture=tMASKED_SCANLINES;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 8)
texture2D tBRIGHTPASS{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D BRIGHTPASS{Texture=tBRIGHTPASS;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#if (DEBUG_PASSES > 9)
texture2D tBLOOM_VERTICAL{Width=BUFFER_WIDTH;Height=BUFFER_HEIGHT;Format=RGBA16f;};
sampler2D BLOOM_VERTICAL{Texture=tBLOOM_VERTICAL;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=LINEAR;MinFilter=LINEAR;};
#endif
#include "crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.fxh"
#if (DEBUG_PASSES > 1)
#include "crt-royale/src/crt-royale-scanlines-vertical-interlacing.fxh"
#endif
#if (DEBUG_PASSES > 2)
#include "crt-royale/src/crt-royale-bloom-approx.fxh"
#endif
#if (DEBUG_PASSES > 3)
#include "crt-royale/src/blur9fast-vertical.fxh"
#endif
#if (DEBUG_PASSES > 4)
#include "crt-royale/src/blur9fast-horizontal.fxh"
#endif
#if (DEBUG_PASSES > 5)
#include "crt-royale/src/crt-royale-mask-resize-vertical.fxh"
#endif
#if (DEBUG_PASSES > 6)
#include "crt-royale/src/crt-royale-mask-resize-horizontal.fxh"
#endif
#if (DEBUG_PASSES > 7)
#include "crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.fxh"
#endif
#if (DEBUG_PASSES > 8)
#include "crt-royale/src/crt-royale-brightpass.fxh"
#endif
#if (DEBUG_PASSES > 9)
#include "crt-royale/src/crt-royale-bloom-vertical.fxh"
#endif
#if (DEBUG_PASSES > 10)
#include "crt-royale/src/crt-royale-bloom-horizontal-reconstitute.fxh"
#endif
technique CRT_Royale
{
pass
{
VertexShader = VS_Linearize;
PixelShader = PS_Linearize;
RenderTarget = tORIG_LINEARIZED;
}
#if (DEBUG_PASSES > 1)
pass
{
VertexShader = VS_Scanlines_Vertical_Interlacing;
PixelShader = PS_Scanlines_Vertical_Interlacing;
RenderTarget = tVERTICAL_SCANLINES;
}
#endif
#if (DEBUG_PASSES > 2)
pass
{
VertexShader = VS_Bloom_Approx;
PixelShader = PS_Bloom_Approx;
RenderTarget = tBLOOM_APPROX;
}
#endif
#if (DEBUG_PASSES > 3)
pass
{
VertexShader = VS_Blur9Fast_Vertical;
PixelShader = PS_Blur9Fast_Vertical;
RenderTarget = tBLUR9FAST_VERTICAL;
}
#endif
#if (DEBUG_PASSES > 4)
pass
{
VertexShader = VS_Blur9Fast_Horizontal;
PixelShader = PS_Blur9Fast_Horizontal;
RenderTarget = tHALATION_BLUR;
}
#endif
#if (DEBUG_PASSES > 5)
pass
{
VertexShader = VS_Mask_Resize_Vertical;
PixelShader = PS_Mask_Resize_Vertical;
RenderTarget = tMASK_RESIZE_VERTICAL;
}
#endif
#if (DEBUG_PASSES > 6)
pass
{
VertexShader = VS_Mask_Resize_Horizontal;
PixelShader = PS_Mask_Resize_Horizontal;
RenderTarget = tMASK_RESIZE;
}
#endif
#if (DEBUG_PASSES > 7)
pass
{
VertexShader = VS_Scanlines_Horizontal_Apply_Mask;
PixelShader = PS_Scanlines_Horizontal_Apply_Mask;
RenderTarget = tMASKED_SCANLINES;
}
#endif
#if (DEBUG_PASSES > 8)
pass
{
VertexShader = VS_Brightpass;
PixelShader = PS_Brightpass;
RenderTarget = tBRIGHTPASS;
}
#endif
#if (DEBUG_PASSES > 9)
pass
{
VertexShader = VS_Bloom_Vertical;
PixelShader = PS_Bloom_Vertical;
RenderTarget = tBLOOM_VERTICAL;
}
#endif
#if (DEBUG_PASSES > 10)
pass
{
VertexShader = VS_Bloom_Horizontal;
PixelShader = PS_Bloom_Horizontal;
}
#endif
}

View file

@ -0,0 +1,280 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS

View file

@ -0,0 +1,249 @@
#ifndef BIND_SHADER_PARAMS_H
#define BIND_SHADER_PARAMS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "helper-functions-and-macros.fxh"
#include "user-settings.fxh"
#include "derived-settings-and-constants.fxh"
// Override some parameters for gamma-management.h and tex2Dantialias.h:
#define OVERRIDE_DEVICE_GAMMA
static const float gba_gamma = 3.5; // Irrelevant but necessary to define.
#define ANTIALIAS_OVERRIDE_BASICS
#define ANTIALIAS_OVERRIDE_PARAMETERS
// Disable runtime shader params if the user doesn't explicitly want them.
// Static constants will be defined in place of uniforms of the same name.
#ifndef RUNTIME_SHADER_PARAMS_ENABLE
#undef PARAMETER_UNIFORM
#endif
// Bind option names to shader parameter uniforms or static constants.
#ifdef PARAMETER_UNIFORM
uniform float crt_gamma;
uniform float lcd_gamma;
uniform float levels_contrast;
uniform float halation_weight;
uniform float diffusion_weight;
uniform float bloom_underestimate_levels;
uniform float bloom_excess;
uniform float beam_min_sigma;
uniform float beam_max_sigma;
uniform float beam_spot_power;
uniform float beam_min_shape;
uniform float beam_max_shape;
uniform float beam_shape_power;
uniform float beam_horiz_sigma;
#ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
uniform float beam_horiz_filter;
uniform float beam_horiz_linear_rgb_weight;
#else
static const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
static const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
#endif
uniform float convergence_offset_x_r;
uniform float convergence_offset_x_g;
uniform float convergence_offset_x_b;
uniform float convergence_offset_y_r;
uniform float convergence_offset_y_g;
uniform float convergence_offset_y_b;
#ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
uniform float mask_type;
#else
static const float mask_type = clamp(mask_type_static, 0.0, 2.0);
#endif
uniform float mask_sample_mode_desired;
uniform float mask_specify_num_triads;
uniform float mask_triad_size_desired;
uniform float mask_num_triads_desired;
uniform float aa_subpixel_r_offset_x_runtime;
uniform float aa_subpixel_r_offset_y_runtime;
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
uniform float aa_cubic_c;
uniform float aa_gauss_sigma;
#else
static const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]?
static const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]?
#endif
uniform float geom_mode_runtime;
uniform float geom_radius;
uniform float geom_view_dist;
uniform float geom_tilt_angle_x;
uniform float geom_tilt_angle_y;
uniform float geom_aspect_ratio_x;
uniform float geom_aspect_ratio_y;
uniform float geom_overscan_x;
uniform float geom_overscan_y;
uniform float border_size;
uniform float border_darkness;
uniform float border_compress;
uniform float interlace_bff;
uniform float interlace_1080i;
#else
// Use constants from user-settings.h, and limit ranges appropriately:
/* static const float crt_gamma = macro_max(0.0, crt_gamma_static);
static const float lcd_gamma = macro_max(0.0, lcd_gamma_static);
static const float levels_contrast = macro_clamp(levels_contrast_static, 0.0, 4.0);
static const float halation_weight = macro_clamp(halation_weight_static, 0.0, 1.0);
static const float diffusion_weight = macro_clamp(diffusion_weight_static, 0.0, 1.0);
static const float bloom_underestimate_levels = macro_max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
static const float bloom_excess = macro_clamp(bloom_excess_static, 0.0, 1.0);
static const float beam_min_sigma = macro_max(FIX_ZERO(0.0), beam_min_sigma_static);
static const float beam_max_sigma = macro_max(beam_min_sigma, beam_max_sigma_static);
static const float beam_spot_power = macro_max(beam_spot_power_static, 0.0);
static const float beam_min_shape = macro_max(2.0, beam_min_shape_static);
static const float beam_max_shape = macro_max(beam_min_shape, beam_max_shape_static);
static const float beam_shape_power = macro_max(0.0, beam_shape_power_static);
static const float beam_horiz_filter = macro_clamp(beam_horiz_filter_static, 0.0, 2.0);
static const float beam_horiz_sigma = macro_max(FIX_ZERO(0.0), beam_horiz_sigma_static);
static const float beam_horiz_linear_rgb_weight = macro_clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
*/ // Unpack static vector elements to match scalar uniforms:
/* static const float convergence_offset_x_r = macro_clamp(convergence_offsets_r_static.x, -4.0, 4.0);
static const float convergence_offset_x_g = macro_clamp(convergence_offsets_g_static.x, -4.0, 4.0);
static const float convergence_offset_x_b = macro_clamp(convergence_offsets_b_static.x, -4.0, 4.0);
static const float convergence_offset_y_r = macro_clamp(convergence_offsets_r_static.y, -4.0, 4.0);
static const float convergence_offset_y_g = macro_clamp(convergence_offsets_g_static.y, -4.0, 4.0);
static const float convergence_offset_y_b = macro_clamp(convergence_offsets_b_static.y, -4.0, 4.0);
static const float mask_type = macro_clamp(mask_type_static, 0.0, 2.0);
static const float mask_sample_mode_desired = macro_clamp(mask_sample_mode_static, 0.0, 2.0);
static const float mask_specify_num_triads = macro_clamp(mask_specify_num_triads_static, 0.0, 1.0);
static const float mask_triad_size_desired = macro_clamp(mask_triad_size_desired_static, 1.0, 18.0);
static const float mask_num_triads_desired = macro_clamp(mask_num_triads_desired_static, 342.0, 1920.0);
static const float aa_subpixel_r_offset_x_runtime = macro_clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
static const float aa_subpixel_r_offset_y_runtime = macro_clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
static const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]?
static const float aa_gauss_sigma = macro_max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]?
static const float geom_mode_runtime = macro_clamp(geom_mode_static, 0.0, 3.0);
static const float geom_radius = macro_max(1.0/(2.0*pi), geom_radius_static); // Clamp to [1/(2*pi), 1024]?
static const float geom_view_dist = macro_max(0.5, geom_view_dist_static); // Clamp to [0.5, 1024]?
static const float geom_tilt_angle_x = macro_clamp(geom_tilt_angle_static.x, -pi, pi);
static const float geom_tilt_angle_y = macro_clamp(geom_tilt_angle_static.y, -pi, pi);
static const float geom_aspect_ratio_x = geom_aspect_ratio_static; // Force >= 1?
static const float geom_aspect_ratio_y = 1.0;
static const float geom_overscan_x = macro_max(FIX_ZERO(0.0), geom_overscan_static.x);
static const float geom_overscan_y = macro_max(FIX_ZERO(0.0), geom_overscan_static.y);
static const float border_size = macro_clamp(border_size_static, 0.0, 0.5); // 0.5 reaches to image center
static const float border_darkness = macro_max(0.0, border_darkness_static);
static const float border_compress = macro_max(1.0, border_compress_static); // < 1.0 darkens whole image
static const float interlace_bff = float(interlace_bff_static);
static const float interlace_1080i = float(interlace_1080i_static);
*/
#endif
/*
// Provide accessors for vector constants that pack scalar uniforms:
float2 get_aspect_vector(const float geom_aspect_ratio)
{
// Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent
// the absolute scale from affecting the uv-mapping for curvature:
const float geom_clamped_aspect_ratio =
min(geom_aspect_ratio, geom_max_aspect_ratio);
const float2 geom_aspect =
normalize(float2(geom_clamped_aspect_ratio, 1.0));
return geom_aspect;
}
float2 get_geom_overscan_vector()
{
return float2(geom_overscan_x, geom_overscan_y);
}
float2 get_geom_tilt_angle_vector()
{
return float2(geom_tilt_angle_x, geom_tilt_angle_y);
}
*/
float3 get_convergence_offsets_x_vector()
{
return float3(convergence_offset_x_r, convergence_offset_x_g,
convergence_offset_x_b);
}
float3 get_convergence_offsets_y_vector()
{
return float3(convergence_offset_y_r, convergence_offset_y_g,
convergence_offset_y_b);
}
float2 get_convergence_offsets_r_vector()
{
return float2(convergence_offset_x_r, convergence_offset_y_r);
}
float2 get_convergence_offsets_g_vector()
{
return float2(convergence_offset_x_g, convergence_offset_y_g);
}
float2 get_convergence_offsets_b_vector()
{
return float2(convergence_offset_x_b, convergence_offset_y_b);
}
/*
float2 get_aa_subpixel_r_offset()
{
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
#ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// WARNING: THIS IS EXTREMELY EXPENSIVE.
return float2(aa_subpixel_r_offset_x_runtime,
aa_subpixel_r_offset_y_runtime);
#else
return aa_subpixel_r_offset_static;
#endif
#else
return aa_subpixel_r_offset_static;
#endif
}
*/
// Provide accessors settings which still need "cooking:"
float get_mask_amplify()
{
static const float mask_grille_amplify = 1.0/mask_grille_avg_color;
static const float mask_slot_amplify = 1.0/mask_slot_avg_color;
static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
return mask_type < 0.5 ? mask_grille_amplify :
mask_type < 1.5 ? mask_slot_amplify :
mask_shadow_amplify;
}
float get_mask_sample_mode()
{
#ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
return mask_sample_mode_desired;
#else
return clamp(mask_sample_mode_desired, 1.0, 2.0);
#endif
#else
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
return mask_sample_mode_static;
#else
return clamp(mask_sample_mode_static, 1.0, 2.0);
#endif
#endif
}
#endif // BIND_SHADER_PARAMS_H

View file

@ -0,0 +1,317 @@
#ifndef BLOOM_FUNCTIONS_H
#define BLOOM_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////////// DESCRIPTION ////////////////////////////////
// These utility functions and constants help several passes determine the
// size and center texel weight of the phosphor bloom in a uniform manner.
////////////////////////////////// INCLUDES //////////////////////////////////
// We need to calculate the correct blur sigma using some .cgp constants:
#include "user-settings.fxh"
#include "derived-settings-and-constants.fxh"
#include "blur-functions.fxh"
/////////////////////////////// BLOOM CONSTANTS //////////////////////////////
// Compute constants with manual inlines of the functions below:
static const float bloom_diff_thresh = 1.0/256.0;
/////////////////////////////////// HELPERS //////////////////////////////////
float get_min_sigma_to_blur_triad(const float triad_size,
const float thresh)
{
// Requires: 1.) triad_size is the final phosphor triad size in pixels
// 2.) thresh is the max desired pixel difference in the
// blurred triad (e.g. 1.0/256.0).
// Returns: Return the minimum sigma that will fully blur a phosphor
// triad on the screen to an even color, within thresh.
// This closed-form function was found by curve-fitting data.
// Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
return -0.05168 + 0.6113*triad_size -
1.122*triad_size*sqrt(0.000416 + thresh);
// Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
//return 0.5985*triad_size - triad_size*sqrt(thresh)
}
float get_absolute_scale_blur_sigma(const float thresh)
{
// Requires: 1.) min_expected_triads must be a global float. The number
// of horizontal phosphor triads in the final image must be
// >= min_allowed_viewport_triads.x for realistic results.
// 2.) bloom_approx_scale_x must be a global float equal to the
// absolute horizontal scale of BLOOM_APPROX.
// 3.) bloom_approx_scale_x/min_allowed_viewport_triads.x
// should be <= 1.1658025090 to keep the final result <
// 0.62666015625 (the largest sigma ensuring the largest
// unused texel weight stays < 1.0/256.0 for a 3x3 blur).
// 4.) thresh is the max desired pixel difference in the
// blurred triad (e.g. 1.0/256.0).
// Returns: Return the minimum Gaussian sigma that will blur the pass
// output as much as it would have taken to blur away
// bloom_approx_scale_x horizontal phosphor triads.
// Description:
// BLOOM_APPROX should look like a downscaled phosphor blur. Ideally, we'd
// use the same blur sigma as the actual phosphor bloom and scale it down
// to the current resolution with (bloom_approx_scale_x/viewport_size_x), but
// we don't know the viewport size in this pass. Instead, we'll blur as
// much as it would take to blur away min_allowed_viewport_triads.x. This
// will blur "more than necessary" if the user actually uses more triads,
// but that's not terrible either, because blurring a constant fraction of
// the viewport may better resemble a true optical bloom anyway (since the
// viewport will generally be about the same fraction of each player's
// field of view, regardless of screen size and resolution).
// Assume an extremely large viewport size for asymptotic results.
return bloom_approx_scale_x/max_viewport_size_x *
get_min_sigma_to_blur_triad(
max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
}
float get_center_weight(const float sigma)
{
// Given a Gaussian blur sigma, get the blur weight for the center texel.
#ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
return get_fast_gaussian_weight_sum_inv(sigma);
#else
const float denom_inv = 0.5/(sigma*sigma);
const float w0 = 1.0;
const float w1 = exp(-1.0 * denom_inv);
const float w2 = exp(-4.0 * denom_inv);
const float w3 = exp(-9.0 * denom_inv);
const float w4 = exp(-16.0 * denom_inv);
const float w5 = exp(-25.0 * denom_inv);
const float w6 = exp(-36.0 * denom_inv);
const float w7 = exp(-49.0 * denom_inv);
const float w8 = exp(-64.0 * denom_inv);
const float w9 = exp(-81.0 * denom_inv);
const float w10 = exp(-100.0 * denom_inv);
const float w11 = exp(-121.0 * denom_inv);
const float w12 = exp(-144.0 * denom_inv);
const float w13 = exp(-169.0 * denom_inv);
const float w14 = exp(-196.0 * denom_inv);
const float w15 = exp(-225.0 * denom_inv);
const float w16 = exp(-256.0 * denom_inv);
const float w17 = exp(-289.0 * denom_inv);
const float w18 = exp(-324.0 * denom_inv);
const float w19 = exp(-361.0 * denom_inv);
const float w20 = exp(-400.0 * denom_inv);
const float w21 = exp(-441.0 * denom_inv);
// Note: If the implementation uses a smaller blur than the max allowed,
// the worst case scenario is that the center weight will be overestimated,
// so we'll put a bit more energy into the brightpass...no huge deal.
// Then again, if the implementation uses a larger blur than the max
// "allowed" because of dynamic branching, the center weight could be
// underestimated, which is more of a problem...consider always using
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// 43x blur:
const float weight_sum_inv = 1.0 /
(w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 +
w11 + w12 + w13 + w14 + w15 + w16 + w17 + w18 + w19 + w20 + w21));
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
// 31x blur:
const float weight_sum_inv = 1.0 /
(w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 +
w8 + w9 + w10 + w11 + w12 + w13 + w14 + w15));
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
// 25x blur:
const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
// 17x blur:
const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
#else
// 9x blur:
const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
const float center_weight = weight_sum_inv * weight_sum_inv;
return center_weight;
#endif
}
float3 tex2DblurNfast(const sampler2D tex, const float2 tex_uv,
const float2 dxdy, const float sigma)
{
// If sigma is static, we can safely branch and use the smallest blur
// that's big enough. Ignore #define hints, because we'll only use a
// large blur if we actually need it, and the branches cost nothing.
#ifndef RUNTIME_PHOSPHOR_BLOOM_SIGMA
#define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
#else
// It's still worth branching if the profile supports dynamic branches:
// It's much faster than using a hugely excessive blur, but each branch
// eats ~1% FPS.
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
#endif
#endif
// Failed optimization notes:
// I originally created a same-size mipmapped 5-tap separable blur10 that
// could handle any sigma by reaching into lower mip levels. It was
// as fast as blur25fast for runtime sigmas and a tad faster than
// blur31fast for static sigmas, but mipmapping two viewport-size passes
// ate 10% of FPS across all codepaths, so it wasn't worth it.
#ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
if(sigma <= blur9_std_dev)
{
return tex2Dblur9fast(tex, tex_uv, dxdy, sigma);
}
else if(sigma <= blur17_std_dev)
{
return tex2Dblur17fast(tex, tex_uv, dxdy, sigma);
}
else if(sigma <= blur25_std_dev)
{
return tex2Dblur25fast(tex, tex_uv, dxdy, sigma);
}
else if(sigma <= blur31_std_dev)
{
return tex2Dblur31fast(tex, tex_uv, dxdy, sigma);
}
else
{
return tex2Dblur43fast(tex, tex_uv, dxdy, sigma);
}
#else
// If we can't afford to branch, we can only guess at what blur
// size we need. Therefore, use the largest blur allowed.
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
return tex2Dblur43fast(tex, tex_uv, dxdy, sigma);
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
return tex2Dblur31fast(tex, tex_uv, dxdy, sigma);
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
return tex2Dblur25fast(tex, tex_uv, dxdy, sigma);
#else
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
return tex2Dblur17fast(tex, tex_uv, dxdy, sigma);
#else
return tex2Dblur9fast(tex, tex_uv, dxdy, sigma);
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
#endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
#endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
}
float get_bloom_approx_sigma(const float output_size_x_runtime,
const float estimated_viewport_size_x)
{
// Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
// This is included for dynamic codepaths just in case the
// following two globals are incorrect:
// 2.) bloom_approx_size_x_for_skip should == the same
// if PHOSPHOR_BLOOM_FAKE is #defined
// 3.) bloom_approx_size_x should == the same otherwise
// Returns: For gaussian4x4, return a dynamic small bloom sigma that's
// as close to optimal as possible given available information.
// For blur3x3, return the a static small bloom sigma that
// works well for typical cases. Otherwise, we're using simple
// bilinear filtering, so use static calculations.
// Assume the default static value. This is a compromise that ensures
// typical triads are blurred, even if unusually large ones aren't.
static const float mask_num_triads_static =
max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
const float mask_num_triads_from_size =
estimated_viewport_size_x/mask_triad_size_desired;
const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
lerp(mask_num_triads_from_size, mask_num_triads_desired,
mask_specify_num_triads));
// Assume an extremely large viewport size for asymptotic results:
static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize
{
// Use the runtime num triads and output size:
const float asymptotic_triad_size =
max_viewport_size_x/mask_num_triads_runtime;
const float asymptotic_sigma = get_min_sigma_to_blur_triad(
asymptotic_triad_size, bloom_diff_thresh);
const float bloom_approx_sigma =
asymptotic_sigma * output_size_x_runtime/max_viewport_size_x;
// The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
// account for the Gaussian scanline sigma from the last pass too.
// The bloom will be too wide horizontally but tall enough vertically.
return length(float2(bloom_approx_sigma, beam_max_sigma));
}
else // 3x3 blur resize (the bilinear resize doesn't need a sigma)
{
// We're either using blur3x3 or bilinear filtering. The biggest
// reason to choose blur3x3 is to avoid dynamic weights, so use a
// static calculation.
#ifdef PHOSPHOR_BLOOM_FAKE
static const float output_size_x_static =
bloom_approx_size_x_for_fake;
#else
static const float output_size_x_static = bloom_approx_size_x;
#endif
static const float asymptotic_triad_size =
max_viewport_size_x/mask_num_triads_static;
const float asymptotic_sigma = get_min_sigma_to_blur_triad(
asymptotic_triad_size, bloom_diff_thresh);
const float bloom_approx_sigma =
asymptotic_sigma * output_size_x_static/max_viewport_size_x;
// The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
// try accounting for the Gaussian scanline sigma from the last pass
// too; use the static default value:
return length(float2(bloom_approx_sigma, beam_max_sigma_static));
}
}
float get_final_bloom_sigma(const float bloom_sigma_runtime)
{
// Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's
// optimal for the [known] triad size.
// 2.) Call this from a fragment shader (not a vertex shader),
// or blurring with static sigmas won't be constant-folded.
// Returns: Return the optimistic static sigma if the triad size is
// known at compile time. Otherwise return the optimal runtime
// sigma (10% slower) or an implementation-specific compromise
// between an optimistic or pessimistic static sigma.
// Notes: Call this from the fragment shader, NOT the vertex shader,
// so static sigmas can be constant-folded!
const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad(
mask_triad_size_desired_static, bloom_diff_thresh);
#ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
return bloom_sigma_runtime;
#else
// Overblurring looks as bad as underblurring, so assume average-size
// triads, not worst-case huge triads:
return bloom_sigma_optimistic;
#endif
}
#endif // BLOOM_FUNCTIONS_H

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,299 @@
#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
#define DERIVED_SETTINGS_AND_CONSTANTS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////////// DESCRIPTION ////////////////////////////////
// These macros and constants can be used across the whole codebase.
// Unlike the values in user-settings.cgh, end users shouldn't modify these.
////////////////////////////////// INCLUDES //////////////////////////////////
#include "user-settings.fxh"
#include "user-cgp-constants.fxh"
/////////////////////////////// FIXED SETTINGS ///////////////////////////////
// Avoid dividing by zero; using a macro overloads for float, float2, etc.:
//#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
#ifndef SIMULATE_CRT_ON_LCD
#define SIMULATE_CRT_ON_LCD
#endif
// Manually tiling a manually resized texture creates texture coord derivative
// discontinuities and confuses anisotropic filtering, causing discolored tile
// seams in the phosphor mask. Workarounds:
// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
// b.) "Tile flat twice" requires drawing two full tiles without border padding
// to the resized mask FBO, and it's incompatible with same-pass curvature.
// (Same-pass curvature isn't used but could be in the future...maybe.)
// c.) "Fix discontinuities" requires derivatives and drawing one tile with
// border padding to the resized mask FBO, but it works with same-pass
// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
// Precedence: a, then, b, then c (if multiple strategies are #defined).
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
#define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
#define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
// Also, manually resampling the phosphor mask is slightly blurrier with
// anisotropic filtering. (Resampling with mipmapping is even worse: It
// creates artifacts, but only with the fully bloomed shader.) The difference
// is subtle with small triads, but you can fix it for a small cost.
//#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
////////////////////////////// DERIVED SETTINGS //////////////////////////////
// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
// #defined by either user-settings.h or a wrapper .cg that #includes the
// current .cg pass.)
#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
#undef PHOSPHOR_MASK_MANUALLY_RESIZE
#endif
#ifdef RUNTIME_GEOMETRY_MODE
#undef RUNTIME_GEOMETRY_MODE
#endif
// Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
// inferior in most cases, so replace 2.0 with 0.0:
static const float bloom_approx_filter =
bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
#else
static const float bloom_approx_filter = bloom_approx_filter_static;
#endif
// Disable slow runtime paths if static parameters are used. Most of these
// won't be a problem anyway once the params are disabled, but some will.
#ifndef RUNTIME_SHADER_PARAMS_ENABLE
#ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
#undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
#endif
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
#undef RUNTIME_ANTIALIAS_WEIGHTS
#endif
#ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#endif
#ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#endif
#ifdef RUNTIME_GEOMETRY_TILT
#undef RUNTIME_GEOMETRY_TILT
#endif
#ifdef RUNTIME_GEOMETRY_MODE
#undef RUNTIME_GEOMETRY_MODE
#endif
#ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#endif
#endif
// Make tex2Dbias a backup for tex2Dlod for wider compatibility.
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
// Rule out unavailable anisotropic compatibility strategies:
#ifndef DRIVERS_ALLOW_DERIVATIVES
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#endif
#ifndef DRIVERS_ALLOW_TEX2DLOD
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#endif
#ifdef ANTIALIAS_DISABLE_ANISOTROPIC
#undef ANTIALIAS_DISABLE_ANISOTROPIC
#endif
#endif
#ifndef DRIVERS_ALLOW_TEX2DBIAS
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
#endif
// Prioritize anisotropic tiling compatibility strategies by performance and
// disable unused strategies. This concentrates all the nesting in one place.
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#else
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#else
// ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
// flat texture coords in the same pass, but that's all we use.
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#endif
#endif
#endif
// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
// reduce some #ifdef nesting in the next section by essentially OR'ing them:
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
#endif
// Prioritize anisotropic resampling compatibility strategies the same way:
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
#endif
/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
// If we can use the large mipmapped LUT without mipmapping artifacts, we
// should: It gives us more options for using fewer samples.
#ifdef DRIVERS_ALLOW_TEX2DLOD
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
// TODO: Take advantage of this!
#define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
static const float2 mask_resize_src_lut_size = mask_texture_large_size;
#else
static const float2 mask_resize_src_lut_size = mask_texture_small_size;
#endif
#else
static const float2 mask_resize_src_lut_size = mask_texture_small_size;
#endif
// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
// main_fragment, or a static alias of one of the above. This makes it hard
// to select the phosphor mask at runtime: We can't even assign to a uniform
// global in the vertex shader or select a sampler2D in the vertex shader and
// pass it to the fragment shader (even with explicit TEXUNIT# bindings),
// because it just gives us the input texture or a black screen. However, we
// can get around these limitations by calling tex2D three times with different
// uniform samplers (or resizing the phosphor mask three times altogether).
// With dynamic branches, we can process only one of these branches on top of
// quickly discarding fragments we don't need (cgc seems able to overcome
// limigations around dependent texture fetches inside of branches). Without
// dynamic branches, we have to process every branch for every fragment...which
// is slower. Runtime sampling mode selection is slower without dynamic
// branches as well. Let the user's static #defines decide if it's worth it.
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#else
#ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#endif
#endif
// We need to render some minimum number of tiles in the resize passes.
// We need at least 1.0 just to repeat a single tile, and we need extra
// padding beyond that for anisotropic filtering, discontinuitity fixing,
// antialiasing, same-pass curvature (not currently used), etc. First
// determine how many border texels and tiles we need, based on how the result
// will be sampled:
#ifdef GEOMETRY_EARLY
static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
// Most antialiasing filters have a base radius of 4.0 pixels:
static const float max_aa_base_pixel_border = 4.0 +
max_subpixel_offset;
#else
static const float max_aa_base_pixel_border = 0.0;
#endif
// Anisotropic filtering adds about 0.5 to the pixel border:
#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
#else
static const float max_aniso_pixel_border = max_aa_base_pixel_border;
#endif
// Fixing discontinuities adds 1.0 more to the pixel border:
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
#else
static const float max_tiled_pixel_border = max_aniso_pixel_border;
#endif
// Convert the pixel border to an integer texel border. Assume same-pass
// curvature about triples the texel frequency:
#ifdef GEOMETRY_EARLY
static const float max_mask_texel_border =
macro_ceil(max_tiled_pixel_border * 3.0);
#else
static const float max_mask_texel_border = macro_ceil(max_tiled_pixel_border);
#endif
// Convert the texel border to a tile border using worst-case assumptions:
static const float max_mask_tile_border = max_mask_texel_border/
(mask_min_allowed_triad_size * mask_triads_per_tile);
// Finally, set the number of resized tiles to render to MASK_RESIZE, and set
// the starting texel (inside borders) for sampling it.
#ifndef GEOMETRY_EARLY
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
// Special case: Render two tiles without borders. Anisotropic
// filtering doesn't seem to be a problem here.
static const float mask_resize_num_tiles = 1.0 + 1.0;
static const float mask_start_texels = 0.0;
#else
static const float mask_resize_num_tiles = 1.0 +
2.0 * max_mask_tile_border;
static const float mask_start_texels = max_mask_texel_border;
#endif
#else
static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
static const float mask_start_texels = max_mask_texel_border;
#endif
// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
// mask_resize_viewport_scale. This limits the maximum final triad size.
// Estimate the minimum number of triads we can split the screen into in each
// dimension (we'll be as correct as mask_resize_viewport_scale is):
static const float mask_resize_num_triads =
mask_resize_num_tiles * mask_triads_per_tile;
static const float2 min_allowed_viewport_triads =
mask_resize_num_triads.xx / mask_resize_viewport_scale;
#endif // DERIVED_SETTINGS_AND_CONSTANTS_H

View file

@ -0,0 +1,545 @@
#ifndef GAMMA_MANAGEMENT_H
#define GAMMA_MANAGEMENT_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////////// DESCRIPTION ////////////////////////////////
// This file provides gamma-aware tex*D*() and encode_output() functions.
// Requires: Before #include-ing this file, the including file must #define
// the following macros when applicable and follow their rules:
// 1.) #define FIRST_PASS if this is the first pass.
// 2.) #define LAST_PASS if this is the last pass.
// 3.) If sRGB is available, set srgb_framebufferN = "true" for
// every pass except the last in your .cgp preset.
// 4.) If sRGB isn't available but you want gamma-correctness with
// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 5-7)
// 6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 6-7)
// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 7)
// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
// If an option in [5, 8] is #defined in the first or last pass, it
// should be #defined for both. It shouldn't make a difference
// whether it's #defined for intermediate passes or not.
// Optional: The including file (or an earlier included file) may optionally
// #define a number of macros indicating it will override certain
// macros and associated constants are as follows:
// static constants with either static or uniform constants. The
// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
// static const float ntsc_gamma
// static const float pal_gamma
// static const float crt_reference_gamma_high
// static const float crt_reference_gamma_low
// static const float lcd_reference_gamma
// static const float crt_office_gamma
// static const float lcd_office_gamma
// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
// static const float crt_gamma
// static const float gba_gamma
// static const float lcd_gamma
// 3.) OVERRIDE_FINAL_GAMMA: The user must first define:
// static const float input_gamma
// static const float intermediate_gamma
// static const float output_gamma
// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
// static const bool assume_opaque_alpha
// The gamma constant overrides must be used in every pass or none,
// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
// Usage: After setting macros appropriately, ignore gamma correction and
// replace all tex*D*() calls with equivalent gamma-aware
// tex*D*_linearize calls, except:
// 1.) When you read an LUT, use regular tex*D or a gamma-specified
// function, depending on its gamma encoding:
// tex*D*_linearize_gamma (takes a runtime gamma parameter)
// 2.) If you must read pass0's original input in a later pass, use
// tex2D_linearize_ntsc_gamma. If you want to read pass0's
// input with gamma-corrected bilinear filtering, consider
// creating a first linearizing pass and reading from the input
// of pass1 later.
// Then, return encode_output(color) from every fragment shader.
// Finally, use the global gamma_aware_bilinear boolean if you want
// to statically branch based on whether bilinear filtering is
// gamma-correct or not (e.g. for placing Gaussian blur samples).
//
// Detailed Policy:
// tex*D*_linearize() functions enforce a consistent gamma-management policy
// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume
// their input texture has the same encoding characteristics as the input for
// the current pass (which doesn't apply to the exceptions listed above).
// Similarly, encode_output() enforces a policy based on the LAST_PASS and
// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the
// following two pipelines.
// Typical pipeline with intermediate sRGB framebuffers:
// linear_color = pow(pass0_encoded_color, input_gamma);
// intermediate_output = linear_color; // Automatic sRGB encoding
// linear_color = intermediate_output; // Automatic sRGB decoding
// final_output = pow(intermediate_output, 1.0/output_gamma);
// Typical pipeline without intermediate sRGB framebuffers:
// linear_color = pow(pass0_encoded_color, input_gamma);
// intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
// linear_color = pow(intermediate_output, intermediate_gamma);
// final_output = pow(intermediate_output, 1.0/output_gamma);
// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
// easily get gamma-correctness without banding on devices where sRGB isn't
// supported.
//
// Use This Header to Maximize Code Reuse:
// The purpose of this header is to provide a consistent interface for texture
// reads and output gamma-encoding that localizes and abstracts away all the
// annoying details. This greatly reduces the amount of code in each shader
// pass that depends on the pass number in the .cgp preset or whether sRGB
// FBO's are being used: You can trivially change the gamma behavior of your
// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same
// code in your first, Nth, and last passes, you can even put it all in another
// header file and #include it from skeleton .cg files that #define the
// appropriate pass-specific settings.
//
// Rationale for Using Three Macros:
// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
// a lower maintenance burden on each pass. At first glance it seems we could
// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
// This works for simple use cases where input_gamma == output_gamma, but it
// breaks down for more complex scenarios like CRT simulation, where the pass
// number determines the gamma encoding of the input and output.
/////////////////////////////// BASE CONSTANTS ///////////////////////////////
// Set standard gamma constants, but allow users to override them:
#ifndef OVERRIDE_STANDARD_GAMMA
// Standard encoding gammas:
static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
static const float pal_gamma = 2.8; // Never actually 2.8 in practice
// Typical device decoding gammas (only use for emulating devices):
// CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
// gammas: The standards purposely undercorrected for an analog CRT's
// assumed 2.5 reference display gamma to maintain contrast in assumed
// [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
// These unstated assumptions about display gamma and perceptual rendering
// intent caused a lot of confusion, and more modern CRT's seemed to target
// NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
// (they struggle near black with 2.5 gamma anyway), especially PC/laptop
// displays designed to view sRGB in bright environments. (Standards are
// also in flux again with BT.1886, but it's underspecified for displays.)
static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
static const float lcd_reference_gamma = 2.5; // To match CRT
static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
static const float lcd_office_gamma = 2.2; // Approximates sRGB
#endif // OVERRIDE_STANDARD_GAMMA
// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
// but only if they're aware of it.
#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
static const bool assume_opaque_alpha = false;
#endif
/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
// gamma-management.h should be compatible with overriding gamma values with
// runtime user parameters, but we can only define other global constants in
// terms of static constants, not uniform user parameters. To get around this
// limitation, we need to define derived constants using functions.
// Set device gamma constants, but allow users to override them:
#ifdef OVERRIDE_DEVICE_GAMMA
// The user promises to globally define the appropriate constants:
float get_crt_gamma() { return crt_gamma; }
float get_gba_gamma() { return gba_gamma; }
float get_lcd_gamma() { return lcd_gamma; }
#else
float get_crt_gamma() { return crt_reference_gamma_high; }
float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
float get_lcd_gamma() { return lcd_office_gamma; }
#endif // OVERRIDE_DEVICE_GAMMA
// Set decoding/encoding gammas for the first/lass passes, but allow overrides:
#ifdef OVERRIDE_FINAL_GAMMA
// The user promises to globally define the appropriate constants:
float get_intermediate_gamma() { return intermediate_gamma; }
float get_input_gamma() { return input_gamma; }
float get_output_gamma() { return output_gamma; }
#else
// If we gamma-correct every pass, always use ntsc_gamma between passes to
// ensure middle passes don't need to care if anything is being simulated:
float get_intermediate_gamma() { return ntsc_gamma; }
#ifdef SIMULATE_CRT_ON_LCD
float get_input_gamma() { return get_crt_gamma(); }
float get_output_gamma() { return get_lcd_gamma(); }
#else
#ifdef SIMULATE_GBA_ON_LCD
float get_input_gamma() { return get_gba_gamma(); }
float get_output_gamma() { return get_lcd_gamma(); }
#else
#ifdef SIMULATE_LCD_ON_CRT
float get_input_gamma() { return get_lcd_gamma(); }
float get_output_gamma() { return get_crt_gamma(); }
#else
#ifdef SIMULATE_GBA_ON_CRT
float get_input_gamma() { return get_gba_gamma(); }
float get_output_gamma() { return get_crt_gamma(); }
#else // Don't simulate anything:
float get_input_gamma() { return ntsc_gamma; }
float get_output_gamma() { return ntsc_gamma; }
#endif // SIMULATE_GBA_ON_CRT
#endif // SIMULATE_LCD_ON_CRT
#endif // SIMULATE_GBA_ON_LCD
#endif // SIMULATE_CRT_ON_LCD
#endif // OVERRIDE_FINAL_GAMMA
// Set decoding/encoding gammas for the current pass. Use static constants for
// linearize_input and gamma_encode_output, because they aren't derived, and
// they let the compiler do dead-code elimination.
#ifndef GAMMA_ENCODE_EVERY_FBO
#ifdef FIRST_PASS
static const bool linearize_input = true;
float get_pass_input_gamma() { return get_input_gamma(); }
#else
static const bool linearize_input = false;
float get_pass_input_gamma() { return 1.0; }
#endif
#ifdef LAST_PASS
static const bool gamma_encode_output = true;
float get_pass_output_gamma() { return get_output_gamma(); }
#else
static const bool gamma_encode_output = false;
float get_pass_output_gamma() { return 1.0; }
#endif
#else
static const bool linearize_input = true;
static const bool gamma_encode_output = true;
#ifdef FIRST_PASS
float get_pass_input_gamma() { return get_input_gamma(); }
#else
float get_pass_input_gamma() { return get_intermediate_gamma(); }
#endif
#ifdef LAST_PASS
float get_pass_output_gamma() { return get_output_gamma(); }
#else
float get_pass_output_gamma() { return get_intermediate_gamma(); }
#endif
#endif
// Users might want to know if bilinear filtering will be gamma-correct:
static const bool gamma_aware_bilinear = !linearize_input;
////////////////////// COLOR ENCODING/DECODING FUNCTIONS /////////////////////
float4 encode_output(const float4 color)
{
if(gamma_encode_output)
{
if(assume_opaque_alpha)
{
return float4(pow(color.rgb, 1.0/get_pass_output_gamma()), 1.0);
}
else
{
return float4(pow(color.rgb, 1.0/get_pass_output_gamma()), color.a);
}
}
else
{
return color;
}
}
float4 decode_input(const float4 color)
{
return color;
}
float4 decode_input_first(const float4 color)
{
if(assume_opaque_alpha)
{
return float4(pow(color.rgb, get_input_gamma()), 1.0);
}
else
{
return float4(pow(color.rgb, get_input_gamma()), color.a);
}
}
float4 decode_gamma_input(const float4 color, const float3 gamma)
{
if(assume_opaque_alpha)
{
return float4(pow(color.rgb, gamma), 1.0);
}
else
{
return float4(pow(color.rgb, gamma), color.a);
}
}
/////////////////////////// TEXTURE LOOKUP WRAPPERS //////////////////////////
// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// Provide a wide array of linearizing texture lookup wrapper functions. The
// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
// lookups are provided for completeness in case that changes someday. Nobody
// is likely to use the *fetch and *proj functions, but they're included just
// in case. The only tex*D texture sampling functions omitted are:
// - tex*Dcmpbias
// - tex*Dcmplod
// - tex*DARRAY*
// - tex*DMS*
// - Variants returning integers
// Standard line length restrictions are ignored below for vertical brevity.
/*
// tex1D:
float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
{ return decode_input(tex1D(tex, tex_coords)); }
float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
{ return decode_input(tex1D(tex, tex_coords)); }
float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
{ return decode_input(tex1D(tex, tex_coords, texel_off)); }
float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
{ return decode_input(tex1D(tex, tex_coords, texel_off)); }
float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
{ return decode_input(tex1D(tex, tex_coords, dx, dy)); }
float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
{ return decode_input(tex1D(tex, tex_coords, dx, dy)); }
float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); }
// tex1Dbias:
float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
{ return decode_input(tex1Dbias(tex, tex_coords)); }
float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); }
// tex1Dfetch:
float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
{ return decode_input(tex1Dfetch(tex, tex_coords)); }
float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); }
// tex1Dlod:
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
{ return decode_input(tex1Dlod(tex, tex_coords)); }
float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); }
// tex1Dproj:
float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
{ return decode_input(tex1Dproj(tex, tex_coords)); }
float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
{ return decode_input(tex1Dproj(tex, tex_coords)); }
float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); }
*/
// tex2D:
float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords)
{ return decode_input(tex2D(tex, tex_coords)); }
float4 tex2D_linearize_first(const sampler2D tex, const float2 tex_coords)
{ return decode_input_first(tex2D(tex, tex_coords)); }
float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords)
{ return decode_input(tex2D(tex, tex_coords.xy)); }
//float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
//{ return decode_input(tex2D(tex, tex_coords, texel_off)); }
//float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
//{ return decode_input(tex2D(tex, tex_coords.xy, texel_off)); }
/*
float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
{ return decode_input(tex2D(tex, tex_coords, dx, dy)); }
float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
{ return decode_input(tex2D(tex, tex_coords, dx, dy)); }
float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
{ return decode_input(tex2D(tex, tex_coords, dx, dy, texel_off)); }
float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
{ return decode_input(tex2D(tex, tex_coords, dx, dy, texel_off)); }
// tex2Dbias:
float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
{ return decode_input(tex2Dbias(tex, tex_coords)); }
float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); }
// tex2Dfetch:
float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
{ return decode_input(tex2Dfetch(tex, tex_coords)); }
float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); }
*/
// tex2Dlod:
float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords)
{ return decode_input(tex2Dlod(tex, tex_coords)); }
//float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
//{ return decode_input(tex2Dlod(tex, tex_coords, texel_off)); }
/*
// tex2Dproj:
float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
{ return decode_input(tex2Dproj(tex, tex_coords)); }
float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
{ return decode_input(tex2Dproj(tex, tex_coords)); }
float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); }
// tex3D:
float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
{ return decode_input(tex3D(tex, tex_coords)); }
float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
{ return decode_input(tex3D(tex, tex_coords, texel_off)); }
float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
{ return decode_input(tex3D(tex, tex_coords, dx, dy)); }
float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); }
// tex3Dbias:
float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
{ return decode_input(tex3Dbias(tex, tex_coords)); }
float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); }
// tex3Dfetch:
float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
{ return decode_input(tex3Dfetch(tex, tex_coords)); }
float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); }
// tex3Dlod:
float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
{ return decode_input(tex3Dlod(tex, tex_coords)); }
float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); }
// tex3Dproj:
float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
{ return decode_input(tex3Dproj(tex, tex_coords)); }
float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); }
// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// This narrow selection of nonstandard tex2D* functions can be useful:
// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); }
float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); }
// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
// Provide a narrower selection of tex2D* wrapper functions that decode an
// input sample with a specified gamma value. These are useful for reading
// LUT's and for reading the input of pass0 in a later pass.
// tex2D:
float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, texel_off), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, texel_off), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, dx, dy), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, dx, dy), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, dx, dy, texel_off), gamma); }
float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2D(tex, tex_coords, dx, dy, texel_off), gamma); }
// tex2Dbias:
float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); }
float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); }
// tex2Dfetch:
float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); }
float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); }
*/
// tex2Dlod:
float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
{ return decode_gamma_input(tex2Dlod(tex, tex_coords), gamma); }
//float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
//{ return decode_gamma_input(tex2Dlod(tex, tex_coords, texel_off), gamma); }
#endif // GAMMA_MANAGEMENT_H

View file

@ -0,0 +1,76 @@
#ifndef _HELPER_FUNCTIONS_AND_MACROS_H
#define _HELPER_FUNCTIONS_AND_MACROS_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2020 Alex Gunter
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
float4 tex2D_nograd(sampler2D tex, float2 tex_coords)
{
return tex2Dlod(tex, float4(tex_coords, 0, 0), 0.0);
}
// ReShade 4 does not permit the use of functions or the ternary operator
// outside of a function definition. This is a problem for this port
// because the original crt-royale shader makes heavy use of these
// constructs at the root level.
// These preprocessor definitions are a workaround for this limitation.
// Note that they are strictly intended for defining complex global
// constants. I doubt they're more performant than the built-in
// equivalents, so I recommend using the built-ins whenever you can.
#define macro_sign(c) -((int) ((c) != 0)) * -((int) ((c) > 0))
#define macro_abs(c) (c) * macro_sign(c)
#define macro_min(c, d) (c) * ((int) ((c) <= (d))) + (d) * ((int) ((c) > (d)))
#define macro_max(c, d) (c) * ((int) ((c) >= (d))) + (d) * ((int) ((c) < (d)))
#define macro_clamp(c, l, u) macro_min(macro_max(c, l), u)
#define macro_ceil(c) (float) ((int) (c) + (int) (((int) (c)) < (c)))
#define macro_cond(c, a, b) float(c) * (a) + float(!(c)) * (b)
//////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
static const float pi = 3.141592653589;
// We often want to find the location of the previous texel, e.g.:
// const float2 curr_texel = uv * texture_size;
// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
// const float2 prev_texel_uv = prev_texel / texture_size;
// However, many GPU drivers round incorrectly around exact texel locations.
// We need to subtract a little less than 0.5 before flooring, and some GPU's
// require this value to be farther from 0.5 than others; define it here.
// const float2 prev_texel =
// floor(curr_texel - float2(under_half)) + float2(0.5);
static const float under_half = 0.4995;
// Avoid dividing by zero; using a macro overloads for float, float2, etc.:
#define FIX_ZERO(c) (macro_max(macro_abs(c), 0.0000152587890625)) // 2^-16
// #define fmod(x, y) ((x) - (y) * floor((x)/(y) + FIX_ZERO(0.0)))
#define fmod(x, y) (frac((x) / (y)) * (y))
#endif // _HELPER_FUNCTIONS_AND_MACROS_H

View file

@ -0,0 +1,676 @@
#ifndef PHOSPHOR_MASK_RESIZING_H
#define PHOSPHOR_MASK_RESIZING_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "user-settings.fxh"
#include "derived-settings-and-constants.fxh"
///////////////////////////// CODEPATH SELECTION /////////////////////////////
// Choose a looping strategy based on what's allowed:
// Dynamic loops not allowed: Use a flat static loop.
// Dynamic loops accomodated: Coarsely branch around static loops.
// Dynamic loops assumed allowed: Use a flat dynamic loop.
#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
#define BREAK_LOOPS_INTO_PIECES
#else
#define USE_SINGLE_STATIC_LOOP
#endif
#endif // No else needed: Dynamic loops assumed.
////////////////////////////////// CONSTANTS /////////////////////////////////
// The larger the resized tile, the fewer samples we'll need for downsizing.
// See if we can get a static min tile size > mask_min_allowed_tile_size:
static const float mask_min_allowed_tile_size = macro_ceil(
mask_min_allowed_triad_size * mask_triads_per_tile);
static const float mask_min_expected_tile_size =
mask_min_allowed_tile_size;
// Limit the number of sinc resize taps by the maximum minification factor:
static const float pi_over_lobes = pi/mask_sinc_lobes;
static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
mask_resize_src_lut_size.x/mask_min_expected_tile_size;
// Vectorized loops sample in multiples of 4. Round up to be safe:
static const float max_sinc_resize_samples_m4 = macro_ceil(
max_sinc_resize_samples_float * 0.25) * 4.0;
///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
float get_dynamic_loop_size(const float magnification_scale)
{
// Requires: The following global constants must be defined:
// 1.) mask_sinc_lobes
// 2.) max_sinc_resize_samples_m4
// Returns: The minimum number of texture samples for a correct downsize
// at magnification_scale.
// We're downsizing, so the filter is sized across 2*lobes output pixels
// (not 2*lobes input texels). This impacts distance measurements and the
// minimum number of input samples needed.
const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
const float max_samples_m4 = max_sinc_resize_samples_m4;
#else // ifdef BREAK_LOOPS_INTO_PIECES
// Simulating loops with branches imposes a 128-sample limit.
const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
#endif
return min(min_samples_m4, max_samples_m4);
}
float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv,
const float2 texture_size, const float dr,
const float input_tiles_per_texture_r, const float samples,
const bool vertical)
{
// Requires: 1.) dr == du == 1.0/texture_size.x or
// dr == dv == 1.0/texture_size.y
// (whichever direction we're resampling in).
// It's a scalar to save register space.
// 2.) input_tiles_per_texture_r is the number of input tiles
// that can fit in the input texture in the direction we're
// resampling this pass.
// 3.) vertical indicates whether we're resampling vertically
// this pass (or horizontally).
// Returns: Pack and return the first sample's tile_uv coord in [0, 1]
// and its texel distance from the destination pixel, in the
// resized dimension only.
// We'll start with the topmost or leftmost sample and work down or right,
// so get the first sample location and distance. Modify both dimensions
// as if we're doing a one-pass 2D resize; we'll throw away the unneeded
// (and incorrect) dimension at the end.
const float2 curr_texel = tex_uv * texture_size;
const float2 prev_texel = floor(curr_texel - under_half.xx) + 0.5.xx;
const float2 first_texel = prev_texel - float2(samples.xx/2.0.xx - 1.0.xx);
const float2 first_texel_uv_wrap_2D = first_texel * dr;
const float2 first_texel_dist_2D = curr_texel - first_texel;
// Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
const float2 first_texel_tile_uv_wrap_2D =
first_texel_uv_wrap_2D * input_tiles_per_texture_r;
// Project wrapped coordinates to the [0, 1] range. We'll do this with all
// samples,but the first texel is special, since it might be negative.
const float2 coord_negative =
float2(first_texel_tile_uv_wrap_2D < 0.0.xx);
const float2 first_texel_tile_uv_2D =
frac(first_texel_tile_uv_wrap_2D) + coord_negative;
// Pack the first texel's tile_uv coord and texel distance in 1D:
const float2 tile_u_and_dist =
float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
const float2 tile_v_and_dist =
float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
return vertical ? tile_v_and_dist : tile_u_and_dist;
//return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
}
float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
{
// Mipmapping and anisotropic filtering get confused by sinc-resampling.
// One [slow] workaround is to select the lowest mip level:
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
return tex2Dlod(tex, float4(tex_uv, 0.0, 0.0));
#else
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
#else
return tex2D(tex, tex_uv);
#endif
#endif
}
////////////////////////////// LOOP BODY MACROS //////////////////////////////
// Using functions can exceed the temporary register limit, so we're
// stuck with #define macros (I'm TRULY sorry). They're declared here instead
// of above to be closer to the actual invocation sites. Steps:
// 1.) Get the exact texel location.
// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
// 3.) Get the distance from the current pixel and sinc weight:
// sinc(dist) = sin(pi * dist)/(pi * dist)
// We can also use the slower/smoother Lanczos instead:
// L(x) = sinc(dist) * sinc(dist / lobes)
// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
// in pixel_color (we'll normalize outside the loop at the end).
// We vectorize the loop to help reduce the Lanczos window's cost.
// The r coord is the coord in the dimension we're resizing along (u or v),
// and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
// tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
// for four new texel samples.
#define CALCULATE_R_COORD_FOR_4_SAMPLES \
const float4 true_i = float4(i_base + i,i_base + i,i_base + i,i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
const float4 tile_uv_r = frac( \
first_texel_tile_uv_rrrr + true_i * tile_dr); \
const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
#ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
const float4 pi_dist_over_lobes = pi_over_lobes * dist; \
const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
(pi_dist*pi_dist_over_lobes), 1.0.xxxx);
#else
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
const float4 weights = min(sin(pi_dist)/pi_dist, 1.0.xxxx);
#endif
#define UPDATE_COLOR_AND_WEIGHT_SUMS \
const float4 dist = magnification_scale * \
abs(first_dist_unscaled - true_i); \
const float4 pi_dist = pi * dist; \
CALCULATE_SINC_RESAMPLE_WEIGHTS; \
pixel_color += new_sample0 * weights.xxx; \
pixel_color += new_sample1 * weights.yyy; \
pixel_color += new_sample2 * weights.zzz; \
pixel_color += new_sample3 * weights.www; \
weight_sum += weights;
#define VERTICAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
const float3 new_sample0 = tex2Dlod0try(tex, \
float2(tex_uv.x, tex_uv_r.x)).rgb; \
const float3 new_sample1 = tex2Dlod0try(tex, \
float2(tex_uv.x, tex_uv_r.y)).rgb; \
const float3 new_sample2 = tex2Dlod0try(tex, \
float2(tex_uv.x, tex_uv_r.z)).rgb; \
const float3 new_sample3 = tex2Dlod0try(tex, \
float2(tex_uv.x, tex_uv_r.w)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
#define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
const float3 new_sample0 = tex2Dlod0try(tex, \
float2(tex_uv_r.x, tex_uv.y)).rgb; \
const float3 new_sample1 = tex2Dlod0try(tex, \
float2(tex_uv_r.y, tex_uv.y)).rgb; \
const float3 new_sample2 = tex2Dlod0try(tex, \
float2(tex_uv_r.z, tex_uv.y)).rgb; \
const float3 new_sample3 = tex2Dlod0try(tex, \
float2(tex_uv_r.w, tex_uv.y)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
//////////////////////////// RESAMPLING FUNCTIONS ////////////////////////////
float3 downsample_vertical_sinc_tiled(const sampler2D tex,
const float2 tex_uv, const float2 texture_size, const float dr,
const float magnification_scale, const float tile_size_uv_r)
{
// Requires: 1.) dr == du == 1.0/texture_size.x or
// dr == dv == 1.0/texture_size.y
// (whichever direction we're resampling in).
// It's a scalar to save register space.
// 2.) tile_size_uv_r is the number of texels an input tile
// takes up in the input texture, in the direction we're
// resampling this pass.
// 3.) magnification_scale must be <= 1.0.
// Returns: Return a [Lanczos] sinc-resampled pixel of a vertically
// downsized input tile embedded in an input texture. (The
// vertical version is special-cased though: It assumes the
// tile size equals the [static] texture size, since it's used
// on an LUT texture input containing one tile. For more
// generic use, eliminate the "static" in the parameters.)
// The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
// we're resizing along, e.g. "dy" in this case.
#ifdef USE_SINGLE_STATIC_LOOP
// A static loop can be faster, but it might blur too much from using
// more samples than it should.
static const int samples = int(max_sinc_resize_samples_m4);
#else
const int samples = int(get_dynamic_loop_size(magnification_scale));
#endif
// Get the first sample location (scalar tile uv coord along the resized
// dimension) and distance from the output location (in texels):
static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// true = vertical resize:
const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true);
const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
static const float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, varying the looping
// strategy based on our expected dynamic loop capabilities. See the
// loop body macros above.
int i_base = 0;
float4 weight_sum = 0.0.xxxx;
float3 pixel_color = 0.0.xxx;
static const int i_step = 4;
#ifdef BREAK_LOOPS_INTO_PIECES
if(samples - i_base >= 64)
{
for(int i = 0; i < 64; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 64;
}
if(samples - i_base >= 32)
{
for(int i = 0; i < 32; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 32;
}
if(samples - i_base >= 16)
{
for(int i = 0; i < 16; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 16;
}
if(samples - i_base >= 8)
{
for(int i = 0; i < 8; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 8;
}
if(samples - i_base >= 4)
{
for(int i = 0; i < 4; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 4;
}
// Do another 4-sample block for a total of 128 max samples.
if(samples - i_base > 0)
{
for(int i = 0; i < 4; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
}
#else
for(int i = 0; i < samples; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
#endif
// Normalize so the weight_sum == 1.0, and return:
const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
const float3 scalar_weight_sum = float3(weight_sum_reduce.xxx +
weight_sum_reduce.yyy);
return (pixel_color/scalar_weight_sum);
}
float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
const float2 tex_uv, const float2 texture_size, const float dr,
const float magnification_scale, const float tile_size_uv_r)
{
// Differences from downsample_horizontal_sinc_tiled:
// 1.) The dr and tile_size_uv_r parameters are not static consts.
// 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
// set to false instead of true.
// 3.) The horizontal version of the loop body is used.
// TODO: If we can get guaranteed compile-time dead code elimination,
// we can combine the vertical/horizontal downsampling functions by:
// 1.) Add an extra static const bool parameter called "vertical."
// 2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
// 3.) Use a conditional assignment in the loop body macro. This is the
// tricky part: We DO NOT want to incur the extra conditional
// assignment in the inner loop at runtime!
// The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
// we're resizing along, e.g. "dx" in this case.
#ifdef USE_SINGLE_STATIC_LOOP
// If we have to load all samples, we might as well use them.
static const int samples = int(max_sinc_resize_samples_m4);
#else
const int samples = int(get_dynamic_loop_size(magnification_scale));
#endif
// Get the first sample location (scalar tile uv coord along resized
// dimension) and distance from the output location (in texels):
const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// false = horizontal resize:
const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false);
const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
const float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, varying the looping
// strategy based on our expected dynamic loop capabilities. See the
// loop body macros above.
int i_base = 0;
float4 weight_sum = 0.0.xxxx;
float3 pixel_color = 0.0.xxx;
static const int i_step = 4;
#ifdef BREAK_LOOPS_INTO_PIECES
if(samples - i_base >= 64)
{
for(int i = 0; i < 64; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 64;
}
if(samples - i_base >= 32)
{
for(int i = 0; i < 32; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 32;
}
if(samples - i_base >= 16)
{
for(int i = 0; i < 16; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 16;
}
if(samples - i_base >= 8)
{
for(int i = 0; i < 8; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 8;
}
if(samples - i_base >= 4)
{
for(int i = 0; i < 4; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 4;
}
// Do another 4-sample block for a total of 128 max samples.
if(samples - i_base > 0)
{
for(int i = 0; i < 4; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
}
#else
for(int i = 0; i < samples; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
#endif
// Normalize so the weight_sum == 1.0, and return:
const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
const float3 scalar_weight_sum = float3(weight_sum_reduce.xxx +
weight_sum_reduce.yyy);
return (pixel_color/scalar_weight_sum);
}
//////////////////////////// TILE SIZE CALCULATION ///////////////////////////
float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
const float2 estimated_mask_resize_output_size,
const bool solemnly_swear_same_inputs_for_every_pass)
{
// Requires: The following global constants must be defined according to
// certain constraints:
// 1.) mask_resize_num_triads: Must be high enough that our
// mask sampling method won't have artifacts later
// (long story; see derived-settings-and-constants.h)
// 2.) mask_resize_src_lut_size: Texel size of our mask LUT
// 3.) mask_triads_per_tile: Num horizontal triads in our LUT
// 4.) mask_min_allowed_triad_size: User setting (the more
// restrictive it is, the faster the resize will go)
// 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
// 6.) mask_triad_size_desired_{runtime, static}
// 7.) mask_num_triads_desired_{runtime, static}
// 8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
// The function parameters must be defined as follows:
// 1.) estimated_viewport_size == (final viewport size);
// If mask_specify_num_triads is 1.0/true and the viewport
// estimate is wrong, the number of triads will differ from
// the user's preference by about the same factor.
// 2.) estimated_mask_resize_output_size: Must equal the
// output size of the MASK_RESIZE pass.
// Exception: The x component may be estimated garbage if
// and only if the caller throws away the x result.
// 3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
// unless you can guarantee that every call across every
// pass will use the same sizes for the other parameters.
// When calling this across multiple passes, always use the
// same y viewport size/scale, and always use the same x
// viewport size/scale when using the x result.
// Returns: Return the final size of a manually resized mask tile, after
// constraining the desired size to avoid artifacts. Under
// unusual circumstances, tiles may become stretched vertically
// (see wall of text below).
// Stated tile properties must be correct:
static const float tile_aspect_ratio_inv =
mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
// If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
// wrong, the user preference will be misinterpreted:
const float desired_tile_size_x = mask_triads_per_tile * lerp(
mask_triad_size_desired,
estimated_viewport_size.x / mask_num_triads_desired,
mask_specify_num_triads);
if(get_mask_sample_mode() > 0.5)
{
// We don't need constraints unless we're sampling MASK_RESIZE.
return desired_tile_size_x * tile_aspect;
}
// Make sure we're not upsizing:
const float temp_tile_size_x =
min(desired_tile_size_x, mask_resize_src_lut_size.x);
// Enforce min_tile_size and max_tile_size in both dimensions:
const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
static const float2 min_tile_size =
mask_min_allowed_tile_size * tile_aspect;
const float2 max_tile_size =
estimated_mask_resize_output_size / mask_resize_num_tiles;
const float2 clamped_tile_size =
clamp(temp_tile_size, min_tile_size, max_tile_size);
// Try to maintain tile_aspect_ratio. This is the tricky part:
// If we're currently resizing in the y dimension, the x components
// could be MEANINGLESS. (If estimated_mask_resize_output_size.x is
// bogus, then so is max_tile_size.x and clamped_tile_size.x.)
// We can't adjust the y size based on clamped_tile_size.x. If it
// clamps when it shouldn't, it won't clamp again when later passes
// call this function with the correct sizes, and the discrepancy will
// break the sampling coords in MASKED_SCANLINES. Instead, we'll limit
// the x size based on the y size, but not vice versa, unless the
// caller swears the parameters were the same (correct) in every pass.
// As a result, triads could appear vertically stretched if:
// a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
// LUT's might clamp x more than y (all provided LUT's are square)
// b.) true_viewport_size.x < true_viewport_size.y: The user is playing
// with a vertically oriented screen (not accounted for anyway)
// c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y:
// Viewport scales are equal by default.
// If any of these are the case, you can fix the stretching by setting:
// mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
// (1.0 / min_expected_aspect_ratio) *
// (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
const float x_tile_size_from_y =
clamped_tile_size.y * tile_aspect_ratio;
const float y_tile_size_from_x = lerp(clamped_tile_size.y,
clamped_tile_size.x * tile_aspect_ratio_inv,
float(solemnly_swear_same_inputs_for_every_pass));
const float2 reclamped_tile_size = float2(
min(clamped_tile_size.x, x_tile_size_from_y),
min(clamped_tile_size.y, y_tile_size_from_x));
// We need integer tile sizes in both directions for tiled sampling to
// work correctly. Use floor (to make sure we don't round up), but be
// careful to avoid a rounding bug where floor decreases whole numbers:
const float2 final_resized_tile_size =
floor(reclamped_tile_size + float2(FIX_ZERO(0.0),FIX_ZERO(0.0)));
return final_resized_tile_size;
}
///////////////////////// FINAL MASK SAMPLING HELPERS ////////////////////////
float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
const float2 mask_resize_video_size, const float2 true_viewport_size,
out float2 mask_tiles_per_screen)
{
// Requires: 1.) Requirements of get_resized_mask_tile_size() must be
// met, particularly regarding global constants.
// The function parameters must be defined as follows:
// 1.) mask_resize_texture_size == MASK_RESIZE.texture_size
// if get_mask_sample_mode() is 0 (otherwise anything)
// 2.) mask_resize_video_size == MASK_RESIZE.video_size
// if get_mask_sample_mode() is 0 (otherwise anything)
// 3.) true_viewport_size == IN.output_size for a pass set to
// 1.0 viewport scale (i.e. it must be correct)
// Returns: Return a float4 containing:
// xy: tex_uv coords for the start of the mask tile
// zw: tex_uv size of the mask tile from start to end
// mask_tiles_per_screen is an out parameter containing the
// number of mask tiles that will fit on the screen.
// First get the final resized tile size. The viewport size and mask
// resize viewport scale must be correct, but don't solemnly swear they
// were correct in both mask resize passes unless you know it's true.
// (We can better ensure a correct tile aspect ratio if the parameters are
// guaranteed correct in all passes...but if we lie, we'll get inconsistent
// sizes across passes, resulting in broken texture coordinates.)
const float mask_sample_mode = get_mask_sample_mode();
const float2 mask_resize_tile_size = get_resized_mask_tile_size(
true_viewport_size, mask_resize_video_size, false);
if(mask_sample_mode < 0.5)
{
// Sample MASK_RESIZE: The resized tile is a fraction of the texture
// size and starts at a nonzero offset to allow for border texels:
const float2 mask_tile_uv_size = mask_resize_tile_size /
mask_resize_texture_size;
const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
// mask_tiles_per_screen must be based on the *true* viewport size:
mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
return float4(mask_tile_start_uv, mask_tile_uv_size);
}
else
{
// If we're tiling at the original size (1:1 pixel:texel), redefine a
// "tile" to be the full texture containing many triads. Otherwise,
// we're hardware-resampling an LUT, and the texture truly contains a
// single unresized phosphor mask tile anyway.
static const float2 mask_tile_uv_size = 1.0.xx;
static const float2 mask_tile_start_uv = 0.0.xx;
if(mask_sample_mode > 1.5)
{
// Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
}
else
{
// Hardware-resize the original LUT:
mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
}
return float4(mask_tile_start_uv, mask_tile_uv_size);
}
}
float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
float2 duv_dx, float2 duv_dy)
{
// Requires: 1.) duv_dx == ddx(tile_uv)
// 2.) duv_dy == ddy(tile_uv)
// 3.) tile_uv contains tile-relative uv coords in [0, 1],
// such that (0.5, 0.5) is the center of a tile, etc.
// ("Tile" can mean texture, the video embedded in the
// texture, or some other "tile" embedded in a texture.)
// Returns: Return new tile_uv coords that contain no discontinuities
// across a 2x2 pixel quad.
// Description:
// When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
// derivatives, which we assume happened if the absolute difference between
// any fragment in a 2x2 block is > ~half a tile. If the current block has
// a u or v discontinuity and the current fragment is in the first half of
// the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
// to that coord to make the 2x2 block continuous. (It will now have a
// coord > 1.0 in the padding area beyond the tile.) This function takes
// derivatives as parameters so the caller can reuse them.
// In case we're using high-quality (nVidia-style) derivatives, ensure
// diagonically opposite fragments see each other for correctness:
duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
const float2 pixel_in_first_half_tile = float2(tile_uv < 0.5.xx);
const float2 jump_exists = float2(duv_dx + duv_dy > 0.5.xx);
return tile_uv + jump_exists * pixel_in_first_half_tile;
}
float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
const float4 mask_tile_start_uv_and_size)
{
// Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the
// tile spans from [0, 1], such that (0.5, 0.5) is at the
// tile center. The input coords can range from [0, inf],
// and their fractional parts map to a repeated tile.
// ("Tile" can mean texture, the video embedded in the
// texture, or some other "tile" embedded in a texture.)
// 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
// for the start of the embedded tile in the full texture.
// 3.) mask_tile_start_uv_and_size.zw contains the [fractional]
// tex_uv size of the embedded tile in the full texture.
// Returns: Return tex_uv coords (used for texture sampling)
// corresponding to tile_uv_wrap.
if(get_mask_sample_mode() < 0.5)
{
// Manually repeat the resized mask tile to fill the screen:
// First get fractional tile_uv coords. Using frac/fmod on coords
// confuses anisotropic filtering; fix it as user options dictate.
// derived-settings-and-constants.h disables incompatible options.
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
#else
float2 tile_uv = frac(tile_uv_wrap);
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
const float2 tile_uv_dx = ddx(tile_uv);
const float2 tile_uv_dy = ddy(tile_uv);
tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
tile_uv_dx, tile_uv_dy);
#endif
// The tile is embedded in a padded FBO, and it may start at a
// nonzero offset if border texels are used to avoid artifacts:
const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
tile_uv * mask_tile_start_uv_and_size.zw;
return mask_tex_uv;
}
else
{
// Sample from the input phosphor mask texture with hardware tiling.
// If we're tiling at the original size (mode 2), the "tile" is the
// whole texture, and it contains a large number of triads mapped with
// a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single
// unresized tile. tile_uv_wrap already has correct coords for both!
return tile_uv_wrap;
}
}
#endif // PHOSPHOR_MASK_RESIZING_H

View file

@ -0,0 +1,243 @@
#ifndef QUAD_PIXEL_COMMUNICATION_H
#define QUAD_PIXEL_COMMUNICATION_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey*
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////////// DISCLAIMER /////////////////////////////////
// *This code was inspired by "Shader Amortization using Pixel Quad Message
// Passing" by Eric Penner, published in GPU Pro 2, Chapter VI.2. My intent
// is not to plagiarize his fundamentally similar code and assert my own
// copyright, but the algorithmic helper functions require so little code that
// implementations can't vary by much except bugfixes and conventions. I just
// wanted to license my own particular code here to avoid ambiguity and make it
// clear that as far as I'm concerned, people can do as they please with it.
///////////////////////////////// DESCRIPTION ////////////////////////////////
// Given screen pixel numbers, derive a "quad vector" describing a fragment's
// position in its 2x2 pixel quad. Given that vector, obtain the values of any
// variable at neighboring fragments.
// Requires: Using this file in general requires:
// 1.) ddx() and ddy() are present in the current Cg profile.
// 2.) The GPU driver is using fine/high-quality derivatives.
// Functions will give incorrect results if this is not true,
// so a test function is included.
///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES ////////////////////
float4 get_quad_vector_naive(const float4 output_pixel_num_wrt_uvxy)
{
// Requires: Two measures of the current fragment's output pixel number
// in the range ([0, IN.output_size.x), [0, IN.output_size.y)):
// 1.) output_pixel_num_wrt_uvxy.xy increase with uv coords.
// 2.) output_pixel_num_wrt_uvxy.zw increase with screen xy.
// Returns: Two measures of the fragment's position in its 2x2 quad:
// 1.) The .xy components are its 2x2 placement with respect to
// uv direction (the origin (0, 0) is at the top-left):
// top-left = (-1.0, -1.0) top-right = ( 1.0, -1.0)
// bottom-left = (-1.0, 1.0) bottom-right = ( 1.0, 1.0)
// You need this to arrange/weight shared texture samples.
// 2.) The .zw components are its 2x2 placement with respect to
// screen xy direction (IN.position); the origin varies.
// quad_gather needs this measure to work correctly.
// Note: quad_vector.zw = quad_vector.xy * float2(
// ddx(output_pixel_num_wrt_uvxy.x),
// ddy(output_pixel_num_wrt_uvxy.y));
// Caveats: This function assumes the GPU driver always starts 2x2 pixel
// quads at even pixel numbers. This assumption can be wrong
// for odd output resolutions (nondeterministically so).
const float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
const float4 quad_vector = pixel_odd * 2.0 - 1.0.xxxx;
return quad_vector;
}
float4 get_quad_vector(const float4 output_pixel_num_wrt_uvxy)
{
// Requires: Same as get_quad_vector_naive() (see that first).
// Returns: Same as get_quad_vector_naive() (see that first), but it's
// correct even if the 2x2 pixel quad starts at an odd pixel,
// which can occur at odd resolutions.
const float4 quad_vector_guess =
get_quad_vector_naive(output_pixel_num_wrt_uvxy);
// If quad_vector_guess.zw doesn't increase with screen xy, we know
// the 2x2 pixel quad starts at an odd pixel:
const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z),
ddy(quad_vector_guess.w));
return quad_vector_guess * odd_start_mirror.xyxy;
}
float4 get_quad_vector(const float2 output_pixel_num_wrt_uv)
{
// Requires: 1.) ddx() and ddy() are present in the current Cg profile.
// 2.) output_pixel_num_wrt_uv must increase with uv coords and
// measure the current fragment's output pixel number in:
// ([0, IN.output_size.x), [0, IN.output_size.y))
// Returns: Same as get_quad_vector_naive() (see that first), but it's
// correct even if the 2x2 pixel quad starts at an odd pixel,
// which can occur at odd resolutions.
// Caveats: This function requires less information than the version
// taking a float4, but it's potentially slower.
// Do screen coords increase with or against uv? Get the direction
// with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
const float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
ddy(output_pixel_num_wrt_uv.y));
const float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
const float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - 0.5.xx) * 2.0;
const float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror;
// If quad_vector_screen_guess doesn't increase with screen xy, we know
// the 2x2 pixel quad starts at an odd pixel:
const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x),
ddy(quad_vector_screen_guess.y));
const float4 quad_vector_guess = float4(
quad_vector_uv_guess, quad_vector_screen_guess);
return quad_vector_guess * odd_start_mirror.xyxy;
}
void quad_gather(const float4 quad_vector, const float4 curr,
out float4 adjx, out float4 adjy, out float4 diag)
{
// Requires: 1.) ddx() and ddy() are present in the current Cg profile.
// 2.) The GPU driver is using fine/high-quality derivatives.
// 3.) quad_vector describes the current fragment's location in
// its 2x2 pixel quad using get_quad_vector()'s conventions.
// 4.) curr is any vector you wish to get neighboring values of.
// Returns: Values of an input vector (curr) at neighboring fragments
// adjacent x, adjacent y, and diagonal (via out parameters).
adjx = curr - ddx(curr) * quad_vector.z;
adjy = curr - ddy(curr) * quad_vector.w;
diag = adjx - ddy(adjx) * quad_vector.w;
}
void quad_gather(const float4 quad_vector, const float3 curr,
out float3 adjx, out float3 adjy, out float3 diag)
{
// Float3 version
adjx = curr - ddx(curr) * quad_vector.z;
adjy = curr - ddy(curr) * quad_vector.w;
diag = adjx - ddy(adjx) * quad_vector.w;
}
void quad_gather(const float4 quad_vector, const float2 curr,
out float2 adjx, out float2 adjy, out float2 diag)
{
// Float2 version
adjx = curr - ddx(curr) * quad_vector.z;
adjy = curr - ddy(curr) * quad_vector.w;
diag = adjx - ddy(adjx) * quad_vector.w;
}
float4 quad_gather(const float4 quad_vector, const float curr)
{
// Float version:
// Returns: return.x == current
// return.y == adjacent x
// return.z == adjacent y
// return.w == diagonal
float4 all = curr.xxxx;
all.y = all.x - ddx(all.x) * quad_vector.z;
all.zw = all.xy - ddy(all.xy) * quad_vector.w;
return all;
}
float4 quad_gather_sum(const float4 quad_vector, const float4 curr)
{
// Requires: Same as quad_gather()
// Returns: Sum of an input vector (curr) at all fragments in a quad.
float4 adjx, adjy, diag;
quad_gather(quad_vector, curr, adjx, adjy, diag);
return (curr + adjx + adjy + diag);
}
float3 quad_gather_sum(const float4 quad_vector, const float3 curr)
{
// Float3 version:
float3 adjx, adjy, diag;
quad_gather(quad_vector, curr, adjx, adjy, diag);
return (curr + adjx + adjy + diag);
}
float2 quad_gather_sum(const float4 quad_vector, const float2 curr)
{
// Float2 version:
float2 adjx, adjy, diag;
quad_gather(quad_vector, curr, adjx, adjy, diag);
return (curr + adjx + adjy + diag);
}
float quad_gather_sum(const float4 quad_vector, const float curr)
{
// Float version:
const float4 all_values = quad_gather(quad_vector, curr);
return (all_values.x + all_values.y + all_values.z + all_values.w);
}
bool fine_derivatives_working(const float4 quad_vector, float4 curr)
{
// Requires: 1.) ddx() and ddy() are present in the current Cg profile.
// 2.) quad_vector describes the current fragment's location in
// its 2x2 pixel quad using get_quad_vector()'s conventions.
// 3.) curr must be a test vector with non-constant derivatives
// (its value should change nonlinearly across fragments).
// Returns: true if fine/hybrid/high-quality derivatives are used, or
// false if coarse derivatives are used or inconclusive
// Usage: Test whether quad-pixel communication is working!
// Method: We can confirm fine derivatives are used if the following
// holds (ever, for any value at any fragment):
// (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
// The more values we test (e.g. test a float4 two ways), the
// easier it is to demonstrate fine derivatives are working.
// TODO: Check for floating point exact comparison issues!
float4 ddx_curr = ddx(curr);
float4 ddy_curr = ddy(curr);
float4 adjx = curr - ddx_curr * quad_vector.z;
float4 adjy = curr - ddy_curr * quad_vector.w;
bool ddy_different = any(ddy_curr != ddy(adjx));
bool ddx_different = any(ddx_curr != ddx(adjy));
return any(bool2(ddy_different, ddx_different));
}
bool fine_derivatives_working_fast(const float4 quad_vector, float curr)
{
// Requires: Same as fine_derivatives_working()
// Returns: Same as fine_derivatives_working()
// Usage: This is faster than fine_derivatives_working() but more
// likely to return false negatives, so it's less useful for
// offline testing/debugging. It's also useless as the basis
// for dynamic runtime branching as of May 2014: Derivatives
// (and quad-pixel communication) are currently disallowed in
// branches. However, future GPU's may allow you to use them
// in dynamic branches if you promise the branch condition
// evaluates the same for every fragment in the quad (and/or if
// the driver enforces that promise by making a single fragment
// control branch decisions). If that ever happens, this
// version may become a more economical choice.
float ddx_curr = ddx(curr);
float ddy_curr = ddy(curr);
float adjx = curr - ddx_curr * quad_vector.z;
return (ddy_curr != ddy(adjx));
}
#endif // QUAD_PIXEL_COMMUNICATION_H

View file

@ -0,0 +1,569 @@
#ifndef SCANLINE_FUNCTIONS_H
#define SCANLINE_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "user-settings.fxh"
#include "derived-settings-and-constants.fxh"
#include "special-functions.fxh"
#include "gamma-management.fxh"
///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////
float3 get_gaussian_sigma(const float3 color, const float sigma_range)
{
// Requires: Globals:
// 1.) beam_min_sigma and beam_max_sigma are global floats
// containing the desired minimum and maximum beam standard
// deviations, for dim and bright colors respectively.
// 2.) beam_max_sigma must be > 0.0
// 3.) beam_min_sigma must be in (0.0, beam_max_sigma]
// 4.) beam_spot_power must be defined as a global float.
// Parameters:
// 1.) color is the underlying source color along a scanline
// 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
// sigma_range as a parameter to avoid repeated computation
// when beam_{min, max}_sigma are runtime shader parameters
// Optional: Users may set beam_spot_shape_function to 1 to define the
// inner f(color) subfunction (see below) as:
// f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
// Otherwise (technically, if beam_spot_shape_function < 0.5):
// f(color) = pow(color, beam_spot_power)
// Returns: The standard deviation of the Gaussian beam for "color:"
// sigma = beam_min_sigma + sigma_range * f(color)
// Details/Discussion:
// The beam's spot shape vaguely resembles an aspect-corrected f() in the
// range [0, 1] (not quite, but it's related). f(color) = color makes
// spots look like diamonds, and a spherical function or cube balances
// between variable width and a soft/realistic shape. A beam_spot_power
// > 1.0 can produce an ugly spot shape and more initial clipping, but the
// final shape also differs based on the horizontal resampling filter and
// the phosphor bloom. For instance, resampling horizontally in nonlinear
// light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
// shape, but a sixth root is still quite soft. A power function (default
// 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
// has the highest variability without an awful spot shape.
//
// beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
// difference from beam_max_sigma affects beam width variability. It only
// affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
// a conservative estimate for a more complex constraint).
//
// beam_max_sigma affects clipping and increasing scanline width/softness
// as color increases. The wider this is, the more scanlines need to be
// evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma
// at which the first unused scanline always has a weight < 1.0/255.0 is:
// num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34
// num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52
// num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70
// num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89
// num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08
// Generalized Gaussians permit more leeway here as steepness increases.
if(beam_spot_shape_function < 0.5)
{
// Use a power function:
return beam_min_sigma.xxx + sigma_range *
pow(color, beam_spot_power);
}
else
{
// Use a spherical function:
const float3 color_minus_1 = color - 1.0.xxx;
return beam_min_sigma.xxx + sigma_range *
sqrt(1.0.xxx - color_minus_1*color_minus_1);
}
}
float3 get_generalized_gaussian_beta(const float3 color,
const float shape_range)
{
// Requires: Globals:
// 1.) beam_min_shape and beam_max_shape are global floats
// containing the desired min/max generalized Gaussian
// beta parameters, for dim and bright colors respectively.
// 2.) beam_max_shape must be >= 2.0
// 3.) beam_min_shape must be in [2.0, beam_max_shape]
// 4.) beam_shape_power must be defined as a global float.
// Parameters:
// 1.) color is the underlying source color along a scanline
// 2.) shape_range = beam_max_shape - beam_min_shape; we take
// shape_range as a parameter to avoid repeated computation
// when beam_{min, max}_shape are runtime shader parameters
// Returns: The type-I generalized Gaussian "shape" parameter beta for
// the given color.
// Details/Discussion:
// Beta affects the scanline distribution as follows:
// a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
// b.) beta == 2.0 just degenerates to a Gaussian
// c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
// than a Gaussian. Whereas high sigmas widen and soften peaks, high
// beta widen and sharpen peaks at the risk of aliasing.
// Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
// transitions, whereas lower ones sharpen them (at the risk of aliasing).
return beam_min_shape + shape_range * pow(color, beam_shape_power);
}
float3 scanline_gaussian_integral_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range)
{
// Requires: 1.) dist is the distance of the [potentially separate R/G/B]
// point(s) from a scanline in units of scanlines, where
// 1.0 means the sample point straddles the next scanline.
// 2.) color is the underlying source color along a scanline.
// 3.) pixel_height is the output pixel height in scanlines.
// 4.) Requirements of get_gaussian_sigma() must be met.
// Returns: Return a scanline's light output over a given pixel.
// Details:
// The CRT beam profile follows a roughly Gaussian distribution which is
// wider for bright colors than dark ones. The integral over the full
// range of a Gaussian function is always 1.0, so we can vary the beam
// with a standard deviation without affecting brightness. 'x' = distance:
// gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
// gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
// Use a numerical approximation of the "error function" (the Gaussian
// indefinite integral) to find the definite integral of the scanline's
// average brightness over a given pixel area. Even if curved coords were
// used in this pass, a flat scalar pixel height works almost as well as a
// pixel height computed from a full pixel-space to scanline-space matrix.
const float3 sigma = get_gaussian_sigma(color, sigma_range);
const float3 ph_offset = (pixel_height.xxx) * 0.5;
const float3 denom_inv = 1.0/(sigma*sqrt(2.0));
const float3 integral_high = erf((dist + ph_offset)*denom_inv);
const float3 integral_low = erf((dist - ph_offset)*denom_inv);
return color * 0.5*(integral_high - integral_low)/pixel_height;
}
float3 scanline_generalized_gaussian_integral_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range,
const float shape_range)
{
// Requires: 1.) Requirements of scanline_gaussian_integral_contrib()
// must be met.
// 2.) Requirements of get_gaussian_sigma() must be met.
// 3.) Requirements of get_generalized_gaussian_beta() must be
// met.
// Returns: Return a scanline's light output over a given pixel.
// A generalized Gaussian distribution allows the shape (beta) to vary
// as well as the width (alpha). "gamma" refers to the gamma function:
// generalized sample =
// beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
// ligamma(s, z) is the lower incomplete gamma function, for which we only
// implement two of four branches (because we keep 1/beta <= 0.5):
// generalized integral = 0.5 + 0.5* sign(x) *
// ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
// See get_generalized_gaussian_beta() for a discussion of beta.
// We base alpha on the intended Gaussian sigma, but it only strictly
// models models standard deviation at beta == 2, because the standard
// deviation depends on both alpha and beta (keeping alpha independent is
// faster and preserves intuitive behavior and a full spectrum of results).
const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
const float3 beta = get_generalized_gaussian_beta(color, shape_range);
const float3 alpha_inv = 1.0.xxx/alpha;
const float3 s = 1.0.xxx/beta;
const float3 ph_offset = (pixel_height.xxx) * 0.5;
// Pass beta to gamma_impl to avoid repeated divides. Similarly pass
// beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
const float3 gamma_s_inv = 1.0.xxx/gamma_impl(s, beta);
const float3 dist1 = dist + ph_offset;
const float3 dist0 = dist - ph_offset;
const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
return color * 0.5*(integral_high - integral_low)/pixel_height;
}
float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
const float pixel_height, const float sigma_range)
{
// See scanline_gaussian integral_contrib() for detailed comments!
// gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
const float3 sigma = get_gaussian_sigma(color, sigma_range);
// Avoid repeated divides:
const float3 sigma_inv = 1.0.xxx/sigma;
const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
if(beam_antialias_level > 0.5)
{
// Sample 1/3 pixel away in each direction as well:
const float3 sample_offset = pixel_height.xxx/3.0;
const float3 dist2 = dist + sample_offset;
const float3 dist3 = abs(dist - sample_offset);
// Average three pure Gaussian samples:
const float3 scale = color/3.0 * outer_denom_inv;
const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
return scale * (weight1 + weight2 + weight3);
}
else
{
return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;
}
}
float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range,
const float shape_range)
{
// See scanline_generalized_gaussian_integral_contrib() for details!
// generalized sample =
// beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
const float3 beta = get_generalized_gaussian_beta(color, shape_range);
// Avoid repeated divides:
const float3 alpha_inv = 1.0.xxx/alpha;
const float3 beta_inv = 1.0.xxx/beta;
const float3 scale = color * beta * 0.5 * alpha_inv /
gamma_impl(beta_inv, beta);
if(beam_antialias_level > 0.5)
{
// Sample 1/3 pixel closer to and farther from the scanline too.
const float3 sample_offset = pixel_height.xxx/3.0;
const float3 dist2 = dist + sample_offset;
const float3 dist3 = abs(dist - sample_offset);
// Average three generalized Gaussian samples:
const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
return scale/3.0 * (weight1 + weight2 + weight3);
}
else
{
return scale * exp(-pow(abs(dist*alpha_inv), beta));
}
}
float3 scanline_contrib(float3 dist, float3 color,
float pixel_height, const float sigma_range, const float shape_range)
{
// Requires: 1.) Requirements of scanline_gaussian_integral_contrib()
// must be met.
// 2.) Requirements of get_gaussian_sigma() must be met.
// 3.) Requirements of get_generalized_gaussian_beta() must be
// met.
// Returns: Return a scanline's light output over a given pixel, using
// a generalized or pure Gaussian distribution and sampling or
// integrals as desired by user codepath choices.
if(beam_generalized_gaussian)
{
if(beam_antialias_level > 1.5)
{
return scanline_generalized_gaussian_integral_contrib(
dist, color, pixel_height, sigma_range, shape_range);
}
else
{
return scanline_generalized_gaussian_sampled_contrib(
dist, color, pixel_height, sigma_range, shape_range);
}
}
else
{
if(beam_antialias_level > 1.5)
{
return scanline_gaussian_integral_contrib(
dist, color, pixel_height, sigma_range);
}
else
{
return scanline_gaussian_sampled_contrib(
dist, color, pixel_height, sigma_range);
}
}
}
float3 get_raw_interpolated_color(const float3 color0,
const float3 color1, const float3 color2, const float3 color3,
const float4 weights)
{
// Use max to avoid bizarre artifacts from negative colors:
return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
}
float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
const float3 color2, const float3 color3, const float4 weights)
{
// Requires: 1.) Requirements of include/gamma-management.h must be met:
// intermediate_gamma must be globally defined, and input
// colors are interpreted as linear RGB unless you #define
// GAMMA_ENCODE_EVERY_FBO (in which case they are
// interpreted as gamma-encoded with intermediate_gamma).
// 2.) color0-3 are colors sampled from a texture with tex2D().
// They are interpreted as defined in requirement 1.
// 3.) weights contains weights for each color, summing to 1.0.
// 4.) beam_horiz_linear_rgb_weight must be defined as a global
// float in [0.0, 1.0] describing how much blending should
// be done in linear RGB (rest is gamma-corrected RGB).
// 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
// if beam_horiz_linear_rgb_weight is anything other than a
// static constant, or we may try branching at runtime
// without dynamic branches allowed (slow).
// Returns: Return an interpolated color lookup between the four input
// colors based on the weights in weights. The final color will
// be a linear RGB value, but the blending will be done as
// indicated above.
const float intermediate_gamma = get_intermediate_gamma();
// Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
// profile allows dynamic branches (faster than computing extra pows):
#ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
#else
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
#endif
#endif
#ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
// beam_horiz_linear_rgb_weight is static, so we can branch:
#ifdef GAMMA_ENCODE_EVERY_FBO
const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
color0, color1, color2, color3, weights), intermediate_gamma);
if(beam_horiz_linear_rgb_weight > 0.0)
{
const float3 linear_mixed_color = get_raw_interpolated_color(
pow(color0, intermediate_gamma),
pow(color1, intermediate_gamma),
pow(color2, intermediate_gamma),
pow(color3, intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
}
else
{
return gamma_mixed_color;
}
#else
const float3 linear_mixed_color = get_raw_interpolated_color(
color0, color1, color2, color3, weights);
if(beam_horiz_linear_rgb_weight < 1.0)
{
const float3 gamma_mixed_color = get_raw_interpolated_color(
pow(color0, 1.0/intermediate_gamma),
pow(color1, 1.0/intermediate_gamma),
pow(color2, 1.0/intermediate_gamma),
pow(color3, 1.0/intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
}
else
{
return linear_mixed_color;
}
#endif // GAMMA_ENCODE_EVERY_FBO
#else
#ifdef GAMMA_ENCODE_EVERY_FBO
// Inputs: color0-3 are colors in gamma-encoded RGB.
const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
color0, color1, color2, color3, weights), intermediate_gamma);
const float3 linear_mixed_color = get_raw_interpolated_color(
pow(color0, intermediate_gamma),
pow(color1, intermediate_gamma),
pow(color2, intermediate_gamma),
pow(color3, intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
#else
// Inputs: color0-3 are colors in linear RGB.
const float3 linear_mixed_color = get_raw_interpolated_color(
color0, color1, color2, color3, weights);
const float3 gamma_mixed_color = get_raw_interpolated_color(
pow(color0, 1.0/intermediate_gamma),
pow(color1, 1.0/intermediate_gamma),
pow(color2, 1.0/intermediate_gamma),
pow(color3, 1.0/intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
#endif // GAMMA_ENCODE_EVERY_FBO
#endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
}
float3 get_scanline_color(const sampler2D Source, const float2 scanline_uv,
const float2 uv_step_x, const float4 weights)
{
// Requires: 1.) scanline_uv must be vertically snapped to the caller's
// desired line or scanline and horizontally snapped to the
// texel just left of the output pixel (color1)
// 2.) uv_step_x must contain the horizontal uv distance
// between texels.
// 3.) weights must contain interpolation filter weights for
// color0, color1, color2, and color3, where color1 is just
// left of the output pixel.
// Returns: Return a horizontally interpolated texture lookup using 2-4
// nearby texels, according to weights and the conventions of
// get_interpolated_linear_color().
// We can ignore the outside texture lookups for Quilez resampling.
const float3 color1 = tex2D(Source, scanline_uv).rgb;
const float3 color2 = tex2D(Source, scanline_uv + uv_step_x).rgb;
float3 color0 = 0.0.xxx;
float3 color3 = 0.0.xxx;
if(beam_horiz_filter > 0.5)
{
color0 = tex2D(Source, scanline_uv - uv_step_x).rgb;
color3 = tex2D(Source, scanline_uv + 2.0 * uv_step_x).rgb;
}
// Sample the texture as-is, whether it's linear or gamma-encoded:
// get_interpolated_linear_color() will handle the difference.
return get_interpolated_linear_color(color0, color1, color2, color3, weights);
}
float3 sample_single_scanline_horizontal(const sampler2D Source,
const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv)
{
// TODO: Add function requirements.
// Snap to the previous texel and get sample dists from 2/4 nearby texels:
const float2 curr_texel = tex_uv * texture_size;
// Use under_half to fix a rounding bug right around exact texel locations.
const float2 prev_texel =
floor(curr_texel - under_half.xx) + 0.5.xx;
const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
const float prev_dist = curr_texel.x - prev_texel_hor.x;
const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
1.0 - prev_dist, 2.0 - prev_dist);
// Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
float4 weights;
if(beam_horiz_filter < 0.5)
{
// Quilez:
const float x = sample_dists.y;
const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
weights = float4(0.0, 1.0 - w2, w2, 0.0);
}
else if(beam_horiz_filter < 1.5)
{
// Gaussian:
float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
}
else
{
// Lanczos2:
const float4 pi_dists = FIX_ZERO(sample_dists * pi);
weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
(pi_dists * pi_dists);
}
// Ensure the weight sum == 1.0:
const float4 final_weights = weights/dot(weights, 1.0.xxxx);
// Get the interpolated horizontal scanline color:
const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
return get_scanline_color(
Source, prev_texel_hor_uv, uv_step_x, final_weights);
}
float3 sample_rgb_scanline_horizontal(const sampler2D Source,
const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv)
{
// TODO: Add function requirements.
// Rely on a helper to make convergence easier.
if(beam_misconvergence)
{
const float3 convergence_offsets_rgb =
get_convergence_offsets_x_vector();
const float3 offset_u_rgb =
convergence_offsets_rgb * texture_size_inv.xxx;
const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
const float3 sample_r = sample_single_scanline_horizontal(
Source, scanline_uv_r, texture_size, texture_size_inv);
const float3 sample_g = sample_single_scanline_horizontal(
Source, scanline_uv_g, texture_size, texture_size_inv);
const float3 sample_b = sample_single_scanline_horizontal(
Source, scanline_uv_b, texture_size, texture_size_inv);
return float3(sample_r.r, sample_g.g, sample_b.b);
}
else
{
return sample_single_scanline_horizontal(Source, tex_uv, texture_size,
texture_size_inv);
}
}
float2 get_last_scanline_uv(const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv, const float2 il_step_multiple,
const float frame_count, out float dist)
{
// Compute texture coords for the last/upper scanline, accounting for
// interlacing: With interlacing, only consider even/odd scanlines every
// other frame. Top-field first (TFF) order puts even scanlines on even
// frames, and BFF order puts them on odd frames. Texels are centered at:
// frac(tex_uv * texture_size) == x.5
// Caution: If these coordinates ever seem incorrect, first make sure it's
// not because anisotropic filtering is blurring across field boundaries.
// Note: TFF/BFF won't matter for sources that double-weave or similar.
const float field_offset = floor(il_step_multiple.y * 0.75) *
fmod(frame_count + float(interlace_bff), 2.0);
const float2 curr_texel = tex_uv * texture_size;
// Use under_half to fix a rounding bug right around exact texel locations.
// This causes an insane bug on duckstation, so it's disabled here. (Hyllian, 2024)
// const float2 prev_texel_num = floor(curr_texel - under_half.xx);
const float2 prev_texel_num = curr_texel;
const float wrong_field = fmod(
prev_texel_num.y + field_offset, il_step_multiple.y);
const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
// Snap to the center of the previous scanline in the current field:
const float2 scanline_texel = scanline_texel_num + 0.5.xx;
const float2 scanline_uv = scanline_texel * texture_size_inv;
// Save the sample's distance from the scanline, in units of scanlines:
dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
return scanline_uv;
}
bool is_interlaced(float num_lines)
{
// Detect interlacing based on the number of lines in the source.
if(interlace_detect)
{
// NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
// NTSC Emulators: Typically 224 or 240 lines
// PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
// PAL Emulators: ?
// ATSC: 720p, 1080i, 1080p
// Where do we place our cutoffs? Assumptions:
// 1.) We only need to care about active lines.
// 2.) Anything > 288 and <= 576 lines is probably interlaced.
// 3.) Anything > 576 lines is probably not interlaced...
// 4.) ...except 1080 lines, which is a crapshoot (user decision).
// 5.) Just in case the main program uses calculated video sizes,
// we should nudge the float thresholds a bit.
const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5));
const bool hd_interlace = interlace_1080i ?
((num_lines > 1079.5) && (num_lines < 1080.5)) :
false;
return (sd_interlace || hd_interlace);
}
else
{
return false;
}
}
#endif // SCANLINE_FUNCTIONS_H

View file

@ -0,0 +1,498 @@
#ifndef SPECIAL_FUNCTIONS_H
#define SPECIAL_FUNCTIONS_H
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////////// DESCRIPTION ////////////////////////////////
// This file implements the following mathematical special functions:
// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
// 2.) gamma(s), a real-numbered extension of the integer factorial function
// It also implements normalized_ligamma(s, z), a normalized lower incomplete
// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can
// be called with an _impl suffix to use an implementation version with a few
// extra precomputed parameters (which may be useful for the caller to reuse).
// See below for details.
//
// Design Rationale:
// Pretty much every line of code in this file is duplicated four times for
// different input types (float4/float3/float2/float). This is unfortunate,
// but Cg doesn't allow function templates. Macros would be far less verbose,
// but they would make the code harder to document and read. I don't expect
// these functions will require a whole lot of maintenance changes unless
// someone ever has need for more robust incomplete gamma functions, so code
// duplication seems to be the lesser evil in this case.
/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
float4 erf6(float4 x)
{
// Requires: x is the standard parameter to erf().
// Returns: Return an Abramowitz/Stegun approximation of erf(), where:
// erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
// This approximation has a max absolute error of 2.5*10**-5
// with solid numerical robustness and efficiency. See:
// https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
static const float4 one = 1.0.xxxx;
const float4 sign_x = sign(x);
const float4 t = one/(one + 0.47047*abs(x));
const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
exp(-(x*x));
return result * sign_x;
}
float3 erf6(const float3 x)
{
// Float3 version:
static const float3 one = 1.0.xxx;
const float3 sign_x = sign(x);
const float3 t = one/(one + 0.47047*abs(x));
const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
exp(-(x*x));
return result * sign_x;
}
float2 erf6(const float2 x)
{
// Float2 version:
static const float2 one = 1.0.xx;
const float2 sign_x = sign(x);
const float2 t = one/(one + 0.47047*abs(x));
const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
exp(-(x*x));
return result * sign_x;
}
float erf6(const float x)
{
// Float version:
const float sign_x = sign(x);
const float t = 1.0/(1.0 + 0.47047*abs(x));
const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
exp(-(x*x));
return result * sign_x;
}
float4 erft(const float4 x)
{
// Requires: x is the standard parameter to erf().
// Returns: Approximate erf() with the hyperbolic tangent. The error is
// visually noticeable, but it's blazing fast and perceptually
// close...at least on ATI hardware. See:
// http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
// Warning: Only use this if your hardware drivers correctly implement
// tanh(): My nVidia 8800GTS returns garbage output.
return tanh(1.202760580 * x);
}
float3 erft(const float3 x)
{
// Float3 version:
return tanh(1.202760580 * x);
}
float2 erft(const float2 x)
{
// Float2 version:
return tanh(1.202760580 * x);
}
float erft(const float x)
{
// Float version:
return tanh(1.202760580 * x);
}
float4 erf(const float4 x)
{
// Requires: x is the standard parameter to erf().
// Returns: Some approximation of erf(x), depending on user settings.
#ifdef ERF_FAST_APPROXIMATION
return erft(x);
#else
return erf6(x);
#endif
}
float3 erf(const float3 x)
{
// Float3 version:
#ifdef ERF_FAST_APPROXIMATION
return erft(x);
#else
return erf6(x);
#endif
}
float2 erf(const float2 x)
{
// Float2 version:
#ifdef ERF_FAST_APPROXIMATION
return erft(x);
#else
return erf6(x);
#endif
}
float erf(const float x)
{
// Float version:
#ifdef ERF_FAST_APPROXIMATION
return erft(x);
#else
return erf6(x);
#endif
}
/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
float4 gamma_impl(const float4 s, const float4 s_inv)
{
// Requires: 1.) s is the standard parameter to the gamma function, and
// it should lie in the [0, 36] range.
// 2.) s_inv = 1.0/s. This implementation function requires
// the caller to precompute this value, giving users the
// opportunity to reuse it.
// Returns: Return approximate gamma function (real-numbered factorial)
// output using the Lanczos approximation with two coefficients
// calculated using Paul Godfrey's method here:
// http://my.fit.edu/~gabdo/gamma.txt
// An optimal g value for s in [0, 36] is ~1.12906830989, with
// a maximum relative error of 0.000463 for 2**16 equally
// evals. We could use three coeffs (0.0000346 error) without
// hurting latency, but this allows more parallelism with
// outside instructions.
static const float4 g = 1.12906830989.xxxx;
static const float4 c0 = 0.8109119309638332633713423362694399653724431.xxxx;
static const float4 c1 = 0.4808354605142681877121661197951496120000040.xxxx;
static const float4 e = 2.71828182845904523536028747135266249775724709.xxxx;
const float4 sph = s + 0.5.xxxx;
const float4 lanczos_sum = c0 + c1/(s + 1.0.xxxx);
const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e
// gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
// This has less error for small s's than (s -= 1.0) at the beginning.
return (pow(base, sph) * lanczos_sum) * s_inv;
}
float3 gamma_impl(const float3 s, const float3 s_inv)
{
// Float3 version:
static const float3 g = 1.12906830989.xxx;
static const float3 c0 = 0.8109119309638332633713423362694399653724431.xxx;
static const float3 c1 = 0.4808354605142681877121661197951496120000040.xxx;
static const float3 e = 2.71828182845904523536028747135266249775724709.xxx;
const float3 sph = s + 0.5.xxx;
const float3 lanczos_sum = c0 + c1/(s + 1.0.xxx);
const float3 base = (sph + g)/e;
return (pow(base, sph) * lanczos_sum) * s_inv;
}
float2 gamma_impl(const float2 s, const float2 s_inv)
{
// Float2 version:
static const float2 g = 1.12906830989.xx;
static const float2 c0 = 0.8109119309638332633713423362694399653724431.xx;
static const float2 c1 = 0.4808354605142681877121661197951496120000040.xx;
static const float2 e = 2.71828182845904523536028747135266249775724709.xx;
const float2 sph = s + 0.5.xx;
const float2 lanczos_sum = c0 + c1/(s + 1.0.xx);
const float2 base = (sph + g)/e;
return (pow(base, sph) * lanczos_sum) * s_inv;
}
float gamma_impl(const float s, const float s_inv)
{
// Float version:
static const float g = 1.12906830989;
static const float c0 = 0.8109119309638332633713423362694399653724431;
static const float c1 = 0.4808354605142681877121661197951496120000040;
static const float e = 2.71828182845904523536028747135266249775724709;
const float sph = s + 0.5;
const float lanczos_sum = c0 + c1/(s + 1.0);
const float base = (sph + g)/e;
return (pow(base, sph) * lanczos_sum) * s_inv;
}
float4 gamma(const float4 s)
{
// Requires: s is the standard parameter to the gamma function, and it
// should lie in the [0, 36] range.
// Returns: Return approximate gamma function output with a maximum
// relative error of 0.000463. See gamma_impl for details.
return gamma_impl(s, 1.0.xxxx/s);
}
float3 gamma(const float3 s)
{
// Float3 version:
return gamma_impl(s, 1.0.xxx/s);
}
float2 gamma(const float2 s)
{
// Float2 version:
return gamma_impl(s, 1.0.xx/s);
}
float gamma(const float s)
{
// Float version:
return gamma_impl(s, 1.0/s);
}
//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
// Lower incomplete gamma function for small s and z (implementation):
float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
{
// Requires: 1.) s < ~0.5
// 2.) z <= ~0.775075
// 3.) s_inv = 1.0/s (precomputed for outside reuse)
// Returns: A series representation for the lower incomplete gamma
// function for small s and small z (4 terms).
// The actual "rolled up" summation looks like:
// last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
// sum = last_sign * last_pow / ((s + k) * last_factorial)
// for(int i = 0; i < 4; ++i)
// {
// last_sign *= -1.0; last_pow *= z; last_factorial *= i;
// sum += last_sign * last_pow / ((s + k) * last_factorial);
// }
// Unrolled, constant-unfolded and arranged for madds and parallelism:
const float4 scale = pow(z, s);
float4 sum = s_inv; // Summation iteration 0 result
// Summation iterations 1, 2, and 3:
const float4 z_sq = z*z;
const float4 denom1 = s + 1.0.xxxx;
const float4 denom2 = 2.0*s + 4.0.xxxx;
const float4 denom3 = 6.0*s + 18.0.xxxx;
//float4 denom4 = 24.0*s + float4(96.0);
sum -= z/denom1;
sum += z_sq/denom2;
sum -= z * z_sq/denom3;
//sum += z_sq * z_sq / denom4;
// Scale and return:
return scale * sum;
}
float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
{
// Float3 version:
const float3 scale = pow(z, s);
float3 sum = s_inv;
const float3 z_sq = z*z;
const float3 denom1 = s + 1.0.xxx;
const float3 denom2 = 2.0*s + 4.0.xxx;
const float3 denom3 = 6.0*s + 18.0.xxx;
sum -= z/denom1;
sum += z_sq/denom2;
sum -= z * z_sq/denom3;
return scale * sum;
}
float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
{
// Float2 version:
const float2 scale = pow(z, s);
float2 sum = s_inv;
const float2 z_sq = z*z;
const float2 denom1 = s + 1.0.xx;
const float2 denom2 = 2.0*s + 4.0.xx;
const float2 denom3 = 6.0*s + 18.0.xx;
sum -= z/denom1;
sum += z_sq/denom2;
sum -= z * z_sq/denom3;
return scale * sum;
}
float ligamma_small_z_impl(const float s, const float z, const float s_inv)
{
// Float version:
const float scale = pow(z, s);
float sum = s_inv;
const float z_sq = z*z;
const float denom1 = s + 1.0;
const float denom2 = 2.0*s + 4.0;
const float denom3 = 6.0*s + 18.0;
sum -= z/denom1;
sum += z_sq/denom2;
sum -= z * z_sq/denom3;
return scale * sum;
}
// Upper incomplete gamma function for small s and large z (implementation):
float4 uigamma_large_z_impl(const float4 s, const float4 z)
{
// Requires: 1.) s < ~0.5
// 2.) z > ~0.775075
// Returns: Gauss's continued fraction representation for the upper
// incomplete gamma function (4 terms).
// The "rolled up" continued fraction looks like this. The denominator
// is truncated, and it's calculated "from the bottom up:"
// denom = float4('inf');
// float4 one = float4(1.0);
// for(int i = 4; i > 0; --i)
// {
// denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
// }
// Unrolled and constant-unfolded for madds and parallelism:
const float4 numerator = pow(z, s) * exp(-z);
float4 denom = 7.0.xxxx + z - s;
denom = 5.0.xxxx + z - s + (3.0*s - 9.0.xxxx)/denom;
denom = 3.0.xxxx + z - s + (2.0*s - 4.0.xxxx)/denom;
denom = 1.0.xxxx + z - s + (s - 1.0.xxxx)/denom;
return numerator / denom;
}
float3 uigamma_large_z_impl(const float3 s, const float3 z)
{
// Float3 version:
const float3 numerator = pow(z, s) * exp(-z);
float3 denom = 7.0.xxx + z - s;
denom = 5.0.xxx + z - s + (3.0*s - 9.0.xxx)/denom;
denom = 3.0.xxx + z - s + (2.0*s - 4.0.xxx)/denom;
denom = 1.0.xxx + z - s + (s - 1.0.xxx)/denom;
return numerator / denom;
}
float2 uigamma_large_z_impl(const float2 s, const float2 z)
{
// Float2 version:
const float2 numerator = pow(z, s) * exp(-z);
float2 denom = 7.0.xx + z - s;
denom = 5.0.xx + z - s + (3.0*s - 9.0.xx)/denom;
denom = 3.0.xx + z - s + (2.0*s - 4.0.xx)/denom;
denom = 1.0.xx + z - s + (s - 1.0.xx)/denom;
return numerator / denom;
}
float uigamma_large_z_impl(const float s, const float z)
{
// Float version:
const float numerator = pow(z, s) * exp(-z);
float denom = 7.0 + z - s;
denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
denom = 1.0 + z - s + (s - 1.0)/denom;
return numerator / denom;
}
// Normalized lower incomplete gamma function for small s (implementation):
float4 normalized_ligamma_impl(const float4 s, const float4 z,
const float4 s_inv, const float4 gamma_s_inv)
{
// Requires: 1.) s < ~0.5
// 2.) s_inv = 1/s (precomputed for outside reuse)
// 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
// Returns: Approximate the normalized lower incomplete gamma function
// for s < 0.5. Since we only care about s < 0.5, we only need
// to evaluate two branches (not four) based on z. Each branch
// uses four terms, with a max relative error of ~0.00182. The
// branch threshold and specifics were adapted for fewer terms
// from Gil/Segura/Temme's paper here:
// http://oai.cwi.nl/oai/asset/20433/20433B.pdf
// Evaluate both branches: Real branches test slower even when available.
static const float4 thresh = 0.775075.xxxx;
const bool4 z_is_large = z > thresh;
const float4 large_z = 1.0.xxxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
// Combine the results from both branches:
return large_z * float4(z_is_large.xxxx) + small_z * float4(!z_is_large.xxxx);
}
float3 normalized_ligamma_impl(const float3 s, const float3 z,
const float3 s_inv, const float3 gamma_s_inv)
{
// Float3 version:
static const float3 thresh = 0.775075.xxx;
const bool3 z_is_large = z > thresh;
const float3 large_z = 1.0.xxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
return large_z * float3(z_is_large.xxx) + small_z * float3(!z_is_large.xxx);
}
float2 normalized_ligamma_impl(const float2 s, const float2 z,
const float2 s_inv, const float2 gamma_s_inv)
{
// Float2 version:
static const float2 thresh = 0.775075.xx;
const bool2 z_is_large = z > thresh;
const float2 large_z = 1.0.xx - uigamma_large_z_impl(s, z) * gamma_s_inv;
const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
return large_z * float2(z_is_large.xx) + small_z * float2(!z_is_large.xx);
}
float normalized_ligamma_impl(const float s, const float z,
const float s_inv, const float gamma_s_inv)
{
// Float version:
static const float thresh = 0.775075;
const bool z_is_large = z > thresh;
const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
return large_z * float(z_is_large) + small_z * float(!z_is_large);
}
// Normalized lower incomplete gamma function for small s:
float4 normalized_ligamma(const float4 s, const float4 z)
{
// Requires: s < ~0.5
// Returns: Approximate the normalized lower incomplete gamma function
// for s < 0.5. See normalized_ligamma_impl() for details.
const float4 s_inv = 1.0.xxxx/s;
const float4 gamma_s_inv = 1.0.xxxx/gamma_impl(s, s_inv);
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
}
float3 normalized_ligamma(const float3 s, const float3 z)
{
// Float3 version:
const float3 s_inv = 1.0.xxx/s;
const float3 gamma_s_inv = 1.0.xxx/gamma_impl(s, s_inv);
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
}
float2 normalized_ligamma(const float2 s, const float2 z)
{
// Float2 version:
const float2 s_inv = 1.0.xx/s;
const float2 gamma_s_inv = 1.0.xx/gamma_impl(s, s_inv);
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
}
float normalized_ligamma(const float s, const float z)
{
// Float version:
const float s_inv = 1.0/s;
const float gamma_s_inv = 1.0/gamma_impl(s, s_inv);
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
}
#endif // SPECIAL_FUNCTIONS_H

View file

@ -0,0 +1,58 @@
#ifndef USER_CGP_CONSTANTS_H
#define USER_CGP_CONSTANTS_H
// IMPORTANT:
// These constants MUST be set appropriately for the settings in crt-royale.cgp
// (or whatever related .cgp file you're using). If they aren't, you're likely
// to get artifacts, the wrong phosphor mask size, etc. I wish these could be
// set directly in the .cgp file to make things easier, but...they can't.
// PASS SCALES AND RELATED CONSTANTS:
// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
// this shader: One does a viewport-scale bloom, and the other skips it. The
// latter benefits from a higher bloom_approx_scale_x, so save both separately:
static const float bloom_approx_size_x = 320.0;
static const float bloom_approx_size_x_for_fake = 400.0;
// Copy the viewport-relative scales of the phosphor mask resize passes
// (MASK_RESIZE and the pass immediately preceding it):
static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
static const float geom_max_aspect_ratio = 4.0/3.0;
// PHOSPHOR MASK TEXTURE CONSTANTS:
// Set the following constants to reflect the properties of the phosphor mask
// texture named in crt-royale.cgp. The shader optionally resizes a mask tile
// based on user settings, then repeats a single tile until filling the screen.
// The shader must know the input texture size (default 64x64), and to manually
// resize, it must also know the horizontal triads per tile (default 8).
static const float2 mask_texture_small_size = 64.0.xx;
static const float2 mask_texture_large_size = 512.0.xx;
static const float mask_triads_per_tile = 8.0;
// We need the average brightness of the phosphor mask to compensate for the
// dimming it causes. The following four values are roughly correct for the
// masks included with the shader. Update the value for any LUT texture you
// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
//#define PHOSPHOR_MASK_GRILLE14
static const float mask_grille14_avg_color = 50.6666666/255.0;
// TileableLinearApertureGrille14Wide7d33Spacing*.png
// TileableLinearApertureGrille14Wide10And6Spacing*.png
static const float mask_grille15_avg_color = 53.0/255.0;
// TileableLinearApertureGrille15Wide6d33Spacing*.png
// TileableLinearApertureGrille15Wide8And5d5Spacing*.png
static const float mask_slot_avg_color = 46.0/255.0;
// TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
// TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
static const float mask_shadow_avg_color = 41.0/255.0;
// TileableLinearShadowMask*.png
// TileableLinearShadowMaskEDP*.png
#ifdef PHOSPHOR_MASK_GRILLE14
static const float mask_grille_avg_color = mask_grille14_avg_color;
#else
static const float mask_grille_avg_color = mask_grille15_avg_color;
#endif
#endif // USER_CGP_CONSTANTS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect_static = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,97 @@
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// PASS SETTINGS:
// gamma-management.h needs to know what kind of pipeline we're using and
// what pass this is in that pipeline. This will become obsolete if/when we
// can #define things like this in the .cgp preset file.
//#define GAMMA_ENCODE_EVERY_FBO
//#define FIRST_PASS
//#define LAST_PASS
//#define SIMULATE_CRT_ON_LCD
//#define SIMULATE_GBA_ON_LCD
//#define SIMULATE_LCD_ON_CRT
//#define SIMULATE_GBA_ON_CRT
////////////////////////////////// INCLUDES //////////////////////////////////
// #included by vertex shader:
#include "../include/gamma-management.fxh"
#include "../include/blur-functions.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p4
{
float2 blur_dxdy : TEXCOORD1;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Blur9Fast_Horizontal(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p4 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
/* float2 texture_size = 1.0/NormalizedNativePixelSize;
float2 output_size = (ViewportSize*BufferToViewportRatio);
float2 video_size = 1.0/NormalizedNativePixelSize;
*/
// float2 texture_size = float2(320.0, 240.0);
float2 texture_size = HALATION_BLUR_texture_size;
float2 output_size = VIEWPORT_SIZE;
// float2 output_size = VIEWPORT_SIZE*NormalizedNativePixelSize/float2(320.0, 240.0);
// float2 output_size = float2(320.0, 240.0);
// float2 output_size = 1.0/NormalizedNativePixelSize;
// Get the uv sample distance between output pixels. Blurs are not generic
// Gaussian resizers, and correct blurs require:
// 1.) IN.output_size == IN.video_size * 2^m, where m is an integer <= 0.
// 2.) mipmap_inputN = "true" for this pass in .cgp preset if m != 0
// 3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs
// Gaussian resizers would upsize using the distance between input texels
// (not output pixels), but we avoid this and consistently blur at the
// destination size. Otherwise, combining statically calculated weights
// with bilinear sample exploitation would result in terrible artifacts.
const float2 dxdy_scale = video_size/output_size;
const float2 dxdy = dxdy_scale/texture_size;
// This blur is horizontal-only, so zero out the vertical offset:
OUT.blur_dxdy = float2(dxdy.x, 0.0);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Blur9Fast_Horizontal(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p4 VAR) : SV_Target
{
float3 color = tex2Dblur9fast(BLUR9FAST_VERTICAL, vTexCoord, VAR.blur_dxdy);
// Encode and output the blurred image:
return encode_output(float4(color, 1.0));
}

View file

@ -0,0 +1,95 @@
///////////////////////////////// MIT LICENSE ////////////////////////////////
// Copyright (C) 2014 TroggleMonkey
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// PASS SETTINGS:
// gamma-management.h needs to know what kind of pipeline we're using and
// what pass this is in that pipeline. This will become obsolete if/when we
// can #define things like this in the .cgp preset file.
//#define GAMMA_ENCODE_EVERY_FBO
//#define FIRST_PASS
//#define LAST_PASS
//#define SIMULATE_CRT_ON_LCD
//#define SIMULATE_GBA_ON_LCD
//#define SIMULATE_LCD_ON_CRT
//#define SIMULATE_GBA_ON_CRT
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/gamma-management.fxh"
#include "../include/blur-functions.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p3
{
float2 blur_dxdy : TEXCOORD1;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Blur9Fast_Vertical(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p3 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
/*
float2 texture_size = 1.0/NormalizedNativePixelSize;
float2 output_size = (ViewportSize*BufferToViewportRatio);
float2 video_size = 1.0/NormalizedNativePixelSize;
*/
// float2 texture_size = float2(320.0, 240.0);
float2 texture_size = BLUR9FAST_VERTICAL_texture_size;
float2 output_size = VIEWPORT_SIZE;
// float2 output_size = VIEWPORT_SIZE/4.0;
// float2 output_size = VIEWPORT_SIZE*NormalizedNativePixelSize/float2(320.0, 240.0);
// float2 output_size = 1.0/NormalizedNativePixelSize;
// Get the uv sample distance between output pixels. Blurs are not generic
// Gaussian resizers, and correct blurs require:
// 1.) IN.output_size == IN.video_size * 2^m, where m is an integer <= 0.
// 2.) mipmap_inputN = "true" for this pass in .cgp preset if m != 0
// 3.) filter_linearN = "true" except for 1x scale nearest neighbor blurs
// Gaussian resizers would upsize using the distance between input texels
// (not output pixels), but we avoid this and consistently blur at the
// destination size. Otherwise, combining statically calculated weights
// with bilinear sample exploitation would result in terrible artifacts.
const float2 dxdy_scale = video_size/output_size;
const float2 dxdy = dxdy_scale/texture_size;
// This blur is vertical-only, so zero out the horizontal offset:
OUT.blur_dxdy = float2(0.0, dxdy.y);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Blur9Fast_Vertical(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p3 VAR) : SV_Target
{
float3 color = tex2Dblur9fast(BLOOM_APPROX, vTexCoord, VAR.blur_dxdy);
// Encode and output the blurred image:
return encode_output(float4(color, 1.0));
}

View file

@ -0,0 +1,363 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#define ORIG_LINEARIZEDvideo_size VERTICAL_SCANLINES_texture_size
#define ORIG_LINEARIZEDtexture_size VERTICAL_SCANLINES_video_size
#define bloom_approx_scale_x (4.0/3.0)
static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
#include "../include/gamma-management.fxh"
#include "../include/blur-functions.fxh"
#include "../include/scanline-functions.fxh"
#include "../include/bloom-functions.fxh"
/////////////////////////////////// HELPERS //////////////////////////////////
float3 tex2Dresize_gaussian4x4(const sampler2D tex, const float2 tex_uv,
const float2 dxdy, const float2 texture_size, const float2 texture_size_inv,
const float2 tex_uv_to_pixel_scale, const float sigma)
{
// Requires: 1.) All requirements of gamma-management.h must be satisfied!
// 2.) filter_linearN must == "true" in your .cgp preset.
// 3.) mipmap_inputN must == "true" in your .cgp preset if
// IN.output_size << SRC.video_size.
// 4.) dxdy should contain the uv pixel spacing:
// dxdy = max(float2(1.0),
// SRC.video_size/IN.output_size)/SRC.texture_size;
// 5.) texture_size == SRC.texture_size
// 6.) texture_size_inv == float2(1.0)/SRC.texture_size
// 7.) tex_uv_to_pixel_scale == IN.output_size *
// SRC.texture_size / SRC.video_size;
// 8.) sigma is the desired Gaussian standard deviation, in
// terms of output pixels. It should be < ~0.66171875 to
// ensure the first unused sample (outside the 4x4 box) has
// a weight < 1.0/256.0.
// Returns: A true 4x4 Gaussian resize of the input.
// Description:
// Given correct inputs, this Gaussian resizer samples 4 pixel locations
// along each downsized dimension and/or 4 texel locations along each
// upsized dimension. It computes dynamic weights based on the pixel-space
// distance of each sample from the destination pixel. It is arbitrarily
// resizable and higher quality than tex2Dblur3x3_resize, but it's slower.
// TODO: Move this to a more suitable file once there are others like it.
const float denom_inv = 0.5/(sigma*sigma);
// We're taking 4x4 samples, and we're snapping to texels for upsizing.
// Find texture coords for sample 5 (second row, second column):
const float2 curr_texel = tex_uv * texture_size;
const float2 prev_texel =
floor(curr_texel - under_half.xx) + 0.5.xx;
const float2 prev_texel_uv = prev_texel * texture_size_inv;
const float2 snap = float2(dxdy <= texture_size_inv);
const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap);
// Compute texture coords for other samples:
const float2 dx = float2(dxdy.x, 0.0);
const float2 sample0_uv = sample5_uv - dxdy;
const float2 sample10_uv = sample5_uv + dxdy;
const float2 sample15_uv = sample5_uv + 2.0 * dxdy;
const float2 sample1_uv = sample0_uv + dx;
const float2 sample2_uv = sample0_uv + 2.0 * dx;
const float2 sample3_uv = sample0_uv + 3.0 * dx;
const float2 sample4_uv = sample5_uv - dx;
const float2 sample6_uv = sample5_uv + dx;
const float2 sample7_uv = sample5_uv + 2.0 * dx;
const float2 sample8_uv = sample10_uv - 2.0 * dx;
const float2 sample9_uv = sample10_uv - dx;
const float2 sample11_uv = sample10_uv + dx;
const float2 sample12_uv = sample15_uv - 3.0 * dx;
const float2 sample13_uv = sample15_uv - 2.0 * dx;
const float2 sample14_uv = sample15_uv - dx;
// Load each sample:
const float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
const float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
const float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
const float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
const float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
const float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
const float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
const float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
const float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
const float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
const float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
const float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
const float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
// Compute destination pixel offsets for each sample:
const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
// Compute Gaussian sample weights:
const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
const float weight_sum_inv = 1.0/(
w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15);
// Weight and sum the samples:
const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
return sum * weight_sum_inv;
}
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p2
{
float2 tex_uv : TEXCOORD1;
float2 blur_dxdy : TEXCOORD2;
float2 uv_scanline_step : TEXCOORD3;
float estimated_viewport_size_x : TEXCOORD4;
float2 texture_size_inv : TEXCOORD5;
float2 tex_uv_to_pixel_scale : TEXCOORD6;
float2 output_size : TEXCOORD7;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Bloom_Approx(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p2 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 texture_size = BLOOM_APPROX_texture_size;
float2 output_size = VIEWPORT_SIZE;
OUT.output_size = output_size;
// This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h,
// except we're using a different source image.
const float2 video_uv = texcoord * texture_size/video_size;
OUT.tex_uv = video_uv * ORIG_LINEARIZEDvideo_size /
ORIG_LINEARIZEDtexture_size;
// The last pass (vertical scanlines) had a viewport y scale, so we can
// use it to calculate a better runtime sigma:
// OUT.estimated_viewport_size_x = video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y;
OUT.estimated_viewport_size_x = video_size.y * texture_size.x/texture_size.y;
// Get the uv sample distance between output pixels. We're using a resize
// blur, so arbitrary upsizing will be acceptable if filter_linearN =
// "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
// "true" too. The blur will be much more accurate if a true 4x4 Gaussian
// resize is used instead of tex2Dblur3x3_resize (which samples between
// texels even for upsizing).
const float2 dxdy_min_scale = ORIG_LINEARIZEDvideo_size/output_size;
const float2 texture_size_inv = 1.0.xx/ORIG_LINEARIZEDtexture_size;
if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize
{
// For upsizing, we'll snap to texels and sample the nearest 4.
const float2 dxdy_scale = max(dxdy_min_scale, 1.0.xx);
OUT.blur_dxdy = dxdy_scale * texture_size_inv;
}
else
{
const float2 dxdy_scale = dxdy_min_scale;
OUT.blur_dxdy = dxdy_scale * texture_size_inv;
}
// tex2Dresize_gaussian4x4 needs to know a bit more than the other filters:
OUT.tex_uv_to_pixel_scale = output_size *
ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size;
OUT.texture_size_inv = texture_size_inv;
// Detecting interlacing again here lets us apply convergence offsets in
// this pass. il_step_multiple contains the (texel, scanline) step
// multiple: 1 for progressive, 2 for interlaced.
const float2 orig_video_size = ORIG_LINEARIZEDvideo_size;
const float y_step = 1.0 + float(is_interlaced(orig_video_size.y));
const float2 il_step_multiple = float2(1.0, y_step);
// Get the uv distance between (texels, same-field scanlines):
OUT.uv_scanline_step = il_step_multiple / ORIG_LINEARIZEDtexture_size;
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Bloom_Approx(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p2 VAR) : SV_Target
{
// Would a viewport-relative size work better for this pass? (No.)
// PROS:
// 1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
// write a viewport scale. That number could be used to directly scale
// the viewport-resolution bloom sigma and/or triad size to a smaller
// scale. This way, we could calculate an optimal dynamic sigma no
// matter how the dot pitch is specified.
// CONS:
// 1.) Texel smearing would be much worse at small viewport sizes, but
// performance would be much worse at large viewport sizes, so there
// would be no easy way to calculate a decent scale.
// 2.) Worse, we could no longer get away with using a constant-size blur!
// Instead, we'd have to face all the same difficulties as the real
// phosphor bloom, which requires static #ifdefs to decide the blur
// size based on the expected triad size...a dynamic value.
// 3.) Like the phosphor bloom, we'd have less control over making the blur
// size correct for an optical blur. That said, we likely overblur (to
// maintain brightness) more than the eye would do by itself: 20/20
// human vision distinguishes ~1 arc minute, or 1/60 of a degree. The
// highest viewing angle recommendation I know of is THX's 40.04 degree
// recommendation, at which 20/20 vision can distinguish about 2402.4
// lines. Assuming the "TV lines" definition, that means 1201.2
// distinct light lines and 1201.2 distinct dark lines can be told
// apart, i.e. 1201.2 pairs of lines. This would correspond to 1201.2
// pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
// (if they're alternately lit). That's a max of 800.8 triads. Using
// a more popular 30 degree viewing angle recommendation, 20/20 vision
// can distinguish 1800 lines, or 600 triads of alternately lit
// phosphors. In contrast, we currently blur phosphors all the way
// down to 341.3 triads to ensure full brightness.
// 4.) Realistically speaking, we're usually just going to use bilinear
// filtering in this pass anyway, but it only works well to limit
// bandwidth if it's done at a small constant scale.
// Get the constants we need to sample:
float2 output_size = VAR.output_size;
//const sampler2D Source = ORIG_LINEARIZED;
const float2 tex_uv = VAR.tex_uv;
const float2 blur_dxdy = VAR.blur_dxdy;
const float2 texture_size = ORIG_LINEARIZEDtexture_size;
const float2 texture_size_inv = VAR.texture_size_inv;
const float2 tex_uv_to_pixel_scale = VAR.tex_uv_to_pixel_scale;
float2 tex_uv_r, tex_uv_g, tex_uv_b;
if(beam_misconvergence)
{
const float2 uv_scanline_step = VAR.uv_scanline_step;
const float2 convergence_offsets_r = get_convergence_offsets_r_vector();
const float2 convergence_offsets_g = get_convergence_offsets_g_vector();
const float2 convergence_offsets_b = get_convergence_offsets_b_vector();
tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
}
// Get the blur sigma:
const float bloom_approx_sigma = get_bloom_approx_sigma(output_size.x,
VAR.estimated_viewport_size_x);
// Sample the resized and blurred texture, and apply convergence offsets if
// necessary. Applying convergence offsets here triples our samples from
// 16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
// HALATION_BLUR 3 times at full resolution every time they're used.
float3 color_r, color_g, color_b, color;
if(bloom_approx_filter > 1.5)
{
// Use a 4x4 Gaussian resize. This is slower but technically correct.
if(beam_misconvergence)
{
color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
blur_dxdy, texture_size, texture_size_inv,
tex_uv_to_pixel_scale, bloom_approx_sigma);
color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
blur_dxdy, texture_size, texture_size_inv,
tex_uv_to_pixel_scale, bloom_approx_sigma);
color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
blur_dxdy, texture_size, texture_size_inv,
tex_uv_to_pixel_scale, bloom_approx_sigma);
}
else
{
color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
blur_dxdy, texture_size, texture_size_inv,
tex_uv_to_pixel_scale, bloom_approx_sigma);
}
}
else if(bloom_approx_filter > 0.5)
{
// Use a 3x3 resize blur. This is the softest option, because we're
// blurring already blurry bilinear samples. It doesn't play quite as
// nicely with convergence offsets, but it has its charms.
if(beam_misconvergence)
{
color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
blur_dxdy, bloom_approx_sigma);
color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
blur_dxdy, bloom_approx_sigma);
color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
blur_dxdy, bloom_approx_sigma);
}
else
{
color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy);
}
}
else
{
// Use bilinear sampling. This approximates a 4x4 Gaussian resize MUCH
// better than tex2Dblur3x3_resize for the very small sigmas we're
// likely to use at small output resolutions. (This estimate becomes
// too sharp above ~400x300, but the blurs break down above that
// resolution too, unless min_allowed_viewport_triads is high enough to
// keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
if(beam_misconvergence)
{
color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
}
else
{
color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
}
}
// Pack the colors from the red/green/blue beams into a single vector:
if(beam_misconvergence)
{
color = float3(color_r.r, color_g.g, color_b.b);
}
// Encode and output the blurred image:
return encode_output(float4(color, 1.0));
}

View file

@ -0,0 +1,129 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/gamma-management.fxh"
#include "../include/bloom-functions.fxh"
#include "../include/phosphor-mask-resizing.fxh"
#include "../include/scanline-functions.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p10
{
float2 video_uv : TEXCOORD1;
float2 bloom_dxdy : TEXCOORD2;
float bloom_sigma_runtime : TEXCOORD3;
float2 sinangle : TEXCOORD4;
float2 cosangle : TEXCOORD5;
float3 stretch : TEXCOORD6;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Bloom_Horizontal(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p10 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 texture_size = BLOOM_HORIZONTAL_texture_size;
float2 output_size = VIEWPORT_SIZE;
// Screen centering
texcoord = texcoord - float2(centerx,centery)/100.0;
float2 tex_uv = texcoord;
// Our various input textures use different coords:
const float2 video_uv = tex_uv * texture_size/video_size;
OUT.video_uv = video_uv;
// We're horizontally blurring the bloom input (vertically blurred
// brightpass). Get the uv distance between output pixels / input texels
// in the horizontal direction (this pass must NOT resize):
OUT.bloom_dxdy = float2(1.0/texture_size.x, 0.0);
// Calculate a runtime bloom_sigma in case it's needed:
const float mask_tile_size_x = get_resized_mask_tile_size(
output_size, output_size * mask_resize_viewport_scale, false).x;
OUT.bloom_sigma_runtime = get_min_sigma_to_blur_triad(
mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
// Precalculate a bunch of useful values we'll need in the fragment
// shader.
OUT.sinangle = sin(float2(geom_x_tilt, geom_y_tilt));
OUT.cosangle = cos(float2(geom_x_tilt, geom_y_tilt));
OUT.stretch = maxscale(OUT.sinangle, OUT.cosangle);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Bloom_Horizontal(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p10 VAR) : SV_Target
{
VAR.video_uv = (geom_curvature == true) ? transform(VAR.video_uv, VAR.sinangle, VAR.cosangle, VAR.stretch) : VAR.video_uv;
float cval = corner((VAR.video_uv-0.5.xx) * BufferToViewportRatio + 0.5.xx);
// Blur the vertically blurred brightpass horizontally by 9/17/25/43x:
const float bloom_sigma = get_final_bloom_sigma(VAR.bloom_sigma_runtime);
const float3 blurred_brightpass = tex2DblurNfast(BLOOM_VERTICAL,
VAR.video_uv, VAR.bloom_dxdy, bloom_sigma);
// Sample the masked scanlines. Alpha contains the auto-dim factor:
const float3 intensity_dim =
tex2D_linearize(MASKED_SCANLINES, VAR.video_uv).rgb;
const float auto_dim_factor = levels_autodim_temp;
const float undim_factor = 1.0/auto_dim_factor;
// Calculate the mask dimpass, add it to the blurred brightpass, and
// undim (from scanline auto-dim) and amplify (from mask dim) the result:
const float mask_amplify = get_mask_amplify();
const float3 brightpass = tex2D_linearize(BRIGHTPASS,
VAR.video_uv).rgb;
const float3 dimpass = intensity_dim - brightpass;
const float3 phosphor_bloom = (dimpass + blurred_brightpass) *
mask_amplify * undim_factor * levels_contrast;
// Sample the halation texture, and let some light bleed into refractive
// diffusion. Conceptually this occurs before the phosphor bloom, but
// adding it in earlier passes causes black crush in the diffusion colors.
const float3 diffusion_color = levels_contrast * tex2D_linearize(
HALATION_BLUR, VAR.video_uv).rgb;
float3 final_bloom = lerp(phosphor_bloom,
diffusion_color, diffusion_weight);
final_bloom = (geom_curvature == true) ? final_bloom * cval.xxx : final_bloom;
final_bloom = pow(final_bloom.rgb, 1.0/get_output_gamma());
// Encode and output the bloomed image:
return encode_output(float4(final_bloom, 1.0));
}

View file

@ -0,0 +1,83 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/gamma-management.fxh"
#include "../include/bloom-functions.fxh"
#include "../include/phosphor-mask-resizing.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p9
{
float2 tex_uv : TEXCOORD1;
float2 bloom_dxdy : TEXCOORD2;
float bloom_sigma_runtime : TEXCOORD3;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Bloom_Vertical(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p9 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 texture_size = BLOOM_VERTICAL_texture_size;
float2 output_size = VIEWPORT_SIZE;
OUT.tex_uv = texcoord;
// Get the uv sample distance between output pixels. Calculate dxdy like
// blurs/vertex-shader-blur-fast-vertical.h.
const float2 dxdy_scale = video_size/output_size;
const float2 dxdy = dxdy_scale/texture_size;
// This blur is vertical-only, so zero out the vertical offset:
OUT.bloom_dxdy = float2(0.0, dxdy.y);
// Calculate a runtime bloom_sigma in case it's needed:
const float mask_tile_size_x = get_resized_mask_tile_size(
output_size, output_size * mask_resize_viewport_scale, false).x;
OUT.bloom_sigma_runtime = get_min_sigma_to_blur_triad(
mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Bloom_Vertical(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p9 VAR) : SV_Target
{
// Blur the brightpass horizontally with a 9/17/25/43x blur:
const float bloom_sigma = get_final_bloom_sigma(VAR.bloom_sigma_runtime);
const float3 color = tex2DblurNfast(BRIGHTPASS, VAR.tex_uv,
VAR.bloom_dxdy, bloom_sigma);
// Encode and output the blurred image:
return encode_output(float4(color, 1.0));
}

View file

@ -0,0 +1,130 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/gamma-management.fxh"
#include "../include/blur-functions.fxh"
#include "../include/phosphor-mask-resizing.fxh"
#include "../include/scanline-functions.fxh"
#include "../include/bloom-functions.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p8
{
float2 video_uv : TEXCOORD1;
float2 scanline_tex_uv : TEXCOORD2;
float2 blur3x3_tex_uv : TEXCOORD3;
float bloom_sigma_runtime : TEXCOORD4;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Brightpass(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p8 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 tex_uv = texcoord;
float2 texture_size = BRIGHTPASS_texture_size;
float2 output_size = VIEWPORT_SIZE;
// Our various input textures use different coords:
const float2 video_uv = tex_uv * texture_size/video_size;
OUT.video_uv = video_uv;
OUT.scanline_tex_uv = video_uv * MASKED_SCANLINES_video_size /
MASKED_SCANLINES_texture_size;
OUT.blur3x3_tex_uv = video_uv * BLOOM_APPROX_video_size / BLOOM_APPROX_texture_size;
// Calculate a runtime bloom_sigma in case it's needed:
const float mask_tile_size_x = get_resized_mask_tile_size(
output_size, output_size * mask_resize_viewport_scale, false).x;
OUT.bloom_sigma_runtime = get_min_sigma_to_blur_triad(
mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Brightpass(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p8 VAR) : SV_Target
{
// Sample the masked scanlines:
const float3 intensity_dim =
tex2D_linearize(MASKED_SCANLINES, VAR.scanline_tex_uv).rgb;
// Get the full intensity, including auto-undimming, and mask compensation:
const float auto_dim_factor = levels_autodim_temp;
const float undim_factor = 1.0/auto_dim_factor;
const float mask_amplify = get_mask_amplify();
const float3 intensity = intensity_dim * undim_factor * mask_amplify *
levels_contrast;
// Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines
// would look like, so we can estimate how much energy we'll receive from
// blooming neighbors:
const float3 phosphor_blur_approx = levels_contrast * tex2D_linearize(
BLOOM_APPROX, VAR.blur3x3_tex_uv).rgb;
// Compute the blur weight for the center texel and the maximum energy we
// expect to receive from neighbors:
const float bloom_sigma = get_final_bloom_sigma(VAR.bloom_sigma_runtime);
const float center_weight = get_center_weight(bloom_sigma);
const float3 max_area_contribution_approx =
max(0.0.xxx, phosphor_blur_approx - center_weight * intensity);
// Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0),
// because it actually gets better results (on top of being very simple),
// but adjust all intensities for the user's desired underestimate factor:
const float3 area_contrib_underestimate =
bloom_underestimate_levels * max_area_contribution_approx;
const float3 intensity_underestimate =
bloom_underestimate_levels * intensity;
// Calculate the blur_ratio, the ratio of intensity we want to blur:
#ifdef BRIGHTPASS_AREA_BASED
// This area-based version changes blur_ratio more smoothly and blurs
// more, clipping less but offering less phosphor differentiation:
const float3 phosphor_blur_underestimate = bloom_underestimate_levels *
phosphor_blur_approx;
const float3 soft_intensity = max(intensity_underestimate,
phosphor_blur_underestimate * mask_amplify);
const float3 blur_ratio_temp =
((1.0.xxx - area_contrib_underestimate) /
soft_intensity - 1.0.xxx) / (center_weight - 1.0);
#else
const float3 blur_ratio_temp =
((1.0.xxx - area_contrib_underestimate) /
intensity_underestimate - 1.0.xxx) / (center_weight - 1.0);
#endif
const float3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0);
// Calculate the brightpass based on the auto-dimmed, unamplified, masked
// scanlines, encode if necessary, and return!
const float3 brightpass = intensity_dim *
lerp(blur_ratio, 1.0.xxx, bloom_excess);
return encode_output(float4(brightpass, 1.0));
}

View file

@ -0,0 +1,109 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// PASS SETTINGS:
// gamma-management.h needs to know what kind of pipeline we're using and
// what pass this is in that pipeline. This will become obsolete if/when we
// can #define things like this in the .cgp preset file.
#define FIRST_PASS
#define SIMULATE_CRT_ON_LCD
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/bind-shader-params.fxh"
#include "../include/gamma-management.fxh"
#include "../include/scanline-functions.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex
{
float2 tex_uv : TEXCOORD1;
float2 uv_step : TEXCOORD2;
float interlaced : TEXCOORD3;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Linearize(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
OUT.tex_uv = texcoord;
// OUT.tex_uv = (floor(texcoord / NormalizedNativePixelSize)+float2(0.5,0.5)) * NormalizedNativePixelSize;
// Save the uv distance between texels:
OUT.uv_step = NormalizedNativePixelSize;
// Detect interlacing: 1.0 = true, 0.0 = false.
OUT.interlaced = is_interlaced(1.0/NormalizedNativePixelSize.y);
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
sampler2D sBackBuffer{Texture=ReShade::BackBufferTex;AddressU=BORDER;AddressV=BORDER;AddressW=BORDER;MagFilter=POINT;MinFilter=POINT;};
#define input_texture sBackBuffer
float4 PS_Linearize(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex VAR) : SV_Target
{
// Linearize the input based on CRT gamma and bob interlaced fields.
// Bobbing ensures we can immediately blur without getting artifacts.
// Note: TFF/BFF won't matter for sources that double-weave or similar.
// VAR.tex_uv = (floor(VAR.tex_uv / NormalizedNativePixelSize)+float2(0.5,0.5)) * NormalizedNativePixelSize;
if(interlace_detect)
{
// Sample the current line and an average of the previous/next line;
// tex2D_linearize will decode CRT gamma. Don't bother branching:
const float2 tex_uv = VAR.tex_uv;
const float2 v_step = float2(0.0, VAR.uv_step.y);
const float3 curr_line = tex2D_linearize_first(
input_texture, tex_uv).rgb;
const float3 last_line = tex2D_linearize_first(
input_texture, tex_uv - v_step).rgb;
const float3 next_line = tex2D_linearize_first(
input_texture, tex_uv + v_step).rgb;
const float3 interpolated_line = 0.5 * (last_line + next_line);
// If we're interlacing, determine which field curr_line is in:
const float modulus = VAR.interlaced + 1.0;
const float field_offset =
fmod(FrameCount + float(interlace_bff), modulus);
const float curr_line_texel = tex_uv.y / NormalizedNativePixelSize.y;
// Use under_half to fix a rounding bug around exact texel locations.
const float line_num_last = floor(curr_line_texel - under_half);
const float wrong_field = fmod(line_num_last + field_offset, modulus);
// Select the correct color, and output the result:
const float3 color = lerp(curr_line, interpolated_line, wrong_field);
return encode_output(float4(color, 1.0));
}
else
{
return encode_output(tex2D_linearize_first(input_texture, VAR.tex_uv));
}
}

View file

@ -0,0 +1,130 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/phosphor-mask-resizing.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p6
{
float2 src_tex_uv_wrap : TEXCOORD1;
float2 tile_uv_wrap : TEXCOORD2;
float2 resize_magnification_scale : TEXCOORD3;
float2 src_dxdy : TEXCOORD4;
float2 tile_size_uv : TEXCOORD5;
float2 input_tiles_per_texture : TEXCOORD6;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Mask_Resize_Horizontal(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p6 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 tex_uv = texcoord;
float2 texture_size = MASK_RESIZE_texture_size;
float2 output_size = 0.0625*(VIEWPORT_SIZE);
// First estimate the viewport size (the user will get the wrong number of
// triads if it's wrong and mask_specify_num_triads is 1.0/true).
const float2 estimated_viewport_size =
output_size / mask_resize_viewport_scale;
// Find the final size of our resized phosphor mask tiles. We probably
// estimated the viewport size and MASK_RESIZE output size differently last
// pass, so do not swear they were the same. ;)
const float2 mask_resize_tile_size = get_resized_mask_tile_size(
estimated_viewport_size, output_size, false);
// We'll render resized tiles until filling the output FBO or meeting a
// limit, so compute [wrapped] tile uv coords based on the output uv coords
// and the number of tiles that will fit in the FBO.
const float2 output_tiles_this_pass = output_size / mask_resize_tile_size;
const float2 output_video_uv = tex_uv * texture_size / video_size;
const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass;
// Get the texel size of an input tile and related values:
const float2 input_tile_size = float2(min(
mask_resize_src_lut_size.x, video_size.x), mask_resize_tile_size.y);
const float2 tile_size_uv = input_tile_size / texture_size;
const float2 input_tiles_per_texture = texture_size / input_tile_size;
// Derive [wrapped] texture uv coords from [wrapped] tile uv coords and
// the tile size in uv coords, and save frac() for the fragment shader.
const float2 src_tex_uv_wrap = tile_uv_wrap * tile_size_uv;
// Output the values we need, including the magnification scale and step:
OUT.tile_uv_wrap = tile_uv_wrap;
OUT.src_tex_uv_wrap = src_tex_uv_wrap;
OUT.resize_magnification_scale = mask_resize_tile_size / input_tile_size;
OUT.src_dxdy = float2(1.0/texture_size.x, 0.0);
OUT.tile_size_uv = tile_size_uv;
OUT.input_tiles_per_texture = input_tiles_per_texture;
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Mask_Resize_Horizontal(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p6 VAR) : SV_Target
{
// The input contains one mask tile horizontally and a number vertically.
// Resize the tile horizontally to its final screen size and repeat it
// until drawing at least mask_resize_num_tiles, leaving it unchanged
// vertically. Lanczos-resizing the phosphor mask achieves much sharper
// results than mipmapping, outputting >= mask_resize_num_tiles makes for
// easier tiled sampling later.
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
// Discard unneeded fragments in case our profile allows real branches.
float2 texture_size = MASK_RESIZE_texture_size;
const float2 tile_uv_wrap = VAR.tile_uv_wrap;
if(get_mask_sample_mode() < 0.5 &&
max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles)
{
const float src_dx = VAR.src_dxdy.x;
const float2 src_tex_uv = frac(VAR.src_tex_uv_wrap);
const float3 pixel_color = downsample_horizontal_sinc_tiled(MASK_RESIZE_VERTICAL,
src_tex_uv, texture_size, VAR.src_dxdy.x,
VAR.resize_magnification_scale.x, VAR.tile_size_uv.x);
// The input LUT was linear RGB, and so is our output:
return float4(pixel_color, 1.0);
}
else
{
discard;
}
#else
discard;
return 1.0.xxxx;
#endif
}

View file

@ -0,0 +1,164 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/phosphor-mask-resizing.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p5
{
float2 src_tex_uv_wrap : TEXCOORD1;
float2 resize_magnification_scale : TEXCOORD2;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Mask_Resize_Vertical(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p5 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 tex_uv = texcoord;
float2 texture_size = MASK_RESIZE_VERT_texture_size;
float2 output_size = float2(64.0, 0.0625*((VIEWPORT_SIZE).y));
// First estimate the viewport size (the user will get the wrong number of
// triads if it's wrong and mask_specify_num_triads is 1.0/true).
const float viewport_y = output_size.y / mask_resize_viewport_scale.y;
// Now get aspect_ratio from texture_size.
// const float aspect_ratio = geom_aspect_ratio_x / geom_aspect_ratio_y;
const float aspect_ratio = texture_size.x / texture_size.y;
const float2 estimated_viewport_size =
float2(viewport_y * aspect_ratio, viewport_y);
// Estimate the output size of MASK_RESIZE (the next pass). The estimated
// x component shouldn't matter, because we're not using the x result, and
// we're not swearing it's correct (if we did, the x result would influence
// the y result to maintain the tile aspect ratio).
const float2 estimated_mask_resize_output_size =
float2(output_size.y * aspect_ratio, output_size.y);
// Find the final intended [y] size of our resized phosphor mask tiles,
// then the tile size for the current pass (resize y only):
const float2 mask_resize_tile_size = get_resized_mask_tile_size(
estimated_viewport_size, estimated_mask_resize_output_size, false);
const float2 pass_output_tile_size = float2(min(
mask_resize_src_lut_size.x, output_size.x), mask_resize_tile_size.y);
// We'll render resized tiles until filling the output FBO or meeting a
// limit, so compute [wrapped] tile uv coords based on the output uv coords
// and the number of tiles that will fit in the FBO.
const float2 output_tiles_this_pass = output_size / pass_output_tile_size;
const float2 output_video_uv = tex_uv * texture_size / video_size;
const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass;
// The input LUT is just a single mask tile, so texture uv coords are the
// same as tile uv coords (save frac() for the fragment shader). The
// magnification scale is also straightforward:
OUT.src_tex_uv_wrap = tile_uv_wrap;
OUT.resize_magnification_scale =
pass_output_tile_size / mask_resize_src_lut_size;
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Mask_Resize_Vertical(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p5 VAR) : SV_Target
{
// Resize the input phosphor mask tile to the final vertical size it will
// appear on screen. Keep 1x horizontal size if possible (IN.output_size
// >= mask_resize_src_lut_size), and otherwise linearly sample horizontally
// to fit exactly one tile. Lanczos-resizing the phosphor mask achieves
// much sharper results than mipmapping, and vertically resizing first
// minimizes the total number of taps required. We output a number of
// resized tiles >= mask_resize_num_tiles for easier tiled sampling later.
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
// Discard unneeded fragments in case our profile allows real branches.
const float2 tile_uv_wrap = VAR.src_tex_uv_wrap;
if(get_mask_sample_mode() < 0.5 &&
tile_uv_wrap.y <= mask_resize_num_tiles)
{
static const float src_dy = 1.0/mask_resize_src_lut_size.y;
const float2 src_tex_uv = frac(VAR.src_tex_uv_wrap);
float3 pixel_color;
// If mask_type is static, this branch will be resolved statically.
#ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
if(mask_type < 0.5)
{
pixel_color = downsample_vertical_sinc_tiled(
mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
else if(mask_type < 1.5)
{
pixel_color = downsample_vertical_sinc_tiled(
mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
else
{
pixel_color = downsample_vertical_sinc_tiled(
mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
#else
if(mask_type < 0.5)
{
pixel_color = downsample_vertical_sinc_tiled(
mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
else if(mask_type < 1.5)
{
pixel_color = downsample_vertical_sinc_tiled(
mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
else
{
pixel_color = downsample_vertical_sinc_tiled(
mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size,
src_dy, VAR.resize_magnification_scale.y, 1.0);
}
#endif
// The input LUT was linear RGB, and so is our output:
return float4(pixel_color, 1.0);
}
else
{
discard;
}
#else
discard;
return 1.0.xxxx;
#endif
}

View file

@ -0,0 +1,283 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../include/user-settings.fxh"
#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../include/scanline-functions.fxh"
#include "../include/phosphor-mask-resizing.fxh"
#include "../include/bloom-functions.fxh"
#include "../include/gamma-management.fxh"
/////////////////////////////////// HELPERS //////////////////////////////////
float4 tex2Dtiled_mask_linearize(const sampler2D tex,
const float2 tex_uv)
{
// If we're manually tiling a texture, anisotropic filtering can get
// confused. One workaround is to just select the lowest mip level:
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
// TODO: Use tex2Dlod_linearize with a calculated mip level.
return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0));
#else
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0));
#else
return tex2D_linearize(tex, tex_uv);
#endif
#endif
#else
return tex2D_linearize(tex, tex_uv);
#endif
}
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p7
{
// Use explicit semantics so COLORx doesn't clamp values outside [0, 1].
float2 video_uv : TEXCOORD1;
float2 scanline_tex_uv : TEXCOORD2;
float2 blur3x3_tex_uv : TEXCOORD3;
float2 halation_tex_uv : TEXCOORD4;
float2 scanline_texture_size_inv : TEXCOORD5;
float4 mask_tile_start_uv_and_size : TEXCOORD6;
float2 mask_tiles_per_screen : TEXCOORD7;
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Scanlines_Horizontal_Apply_Mask(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p7 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
float2 tex_uv = texcoord;
float2 texture_size = MASKED_SCANLINES_texture_size;
float2 output_size = VIEWPORT_SIZE;
// Our various input textures use different coords.
const float2 video_uv = tex_uv * texture_size/video_size;
const float2 scanline_texture_size_inv =
1.0.xx/VERTICAL_SCANLINES_texture_size;
OUT.video_uv = video_uv;
OUT.scanline_tex_uv = video_uv * VERTICAL_SCANLINES_video_size *
scanline_texture_size_inv;
OUT.blur3x3_tex_uv = video_uv * BLOOM_APPROX_video_size /
BLOOM_APPROX_texture_size;
OUT.halation_tex_uv = video_uv * HALATION_BLUR_video_size /
HALATION_BLUR_texture_size;
OUT.scanline_texture_size_inv = scanline_texture_size_inv;
// Get a consistent name for the final mask texture size. Sample mode 0
// uses the manually resized mask, but ignore it if we never resized.
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
const float mask_sample_mode = get_mask_sample_mode();
const float2 mask_resize_texture_size = mask_sample_mode < 0.5 ?
MASKED_SCANLINES_texture_size : mask_texture_large_size;
const float2 mask_resize_video_size = mask_sample_mode < 0.5 ?
MASKED_SCANLINES_video_size : mask_texture_large_size;
#else
const float2 mask_resize_texture_size = mask_texture_large_size;
const float2 mask_resize_video_size = mask_texture_large_size;
#endif
// Compute mask tile dimensions, starting points, etc.:
float2 mask_tiles_per_screen;
OUT.mask_tile_start_uv_and_size = get_mask_sampling_parameters(
mask_resize_texture_size, mask_resize_video_size, output_size,
mask_tiles_per_screen);
OUT.mask_tiles_per_screen = mask_tiles_per_screen;
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Scanlines_Horizontal_Apply_Mask(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p7 VAR) : SV_Target
{
// This pass: Sample (misconverged?) scanlines to the final horizontal
// resolution, apply halation (bouncing electrons), and apply the phosphor
// mask. Fake a bloom if requested. Unless we fake a bloom, the output
// will be dim from the scanline auto-dim, mask dimming, and low gamma.
// Horizontally sample the current row (a vertically interpolated scanline)
// and account for horizontal convergence offsets, given in units of texels.
// float2 VERTICAL_SCANLINES_texture_size = float2(1.0/NormalizedNativePixelSize.x, ViewportSize.y*BufferToViewportRatio.y);
float2 output_size = VIEWPORT_SIZE;
const float3 scanline_color_dim = sample_rgb_scanline_horizontal(
VERTICAL_SCANLINES, VAR.scanline_tex_uv,
VERTICAL_SCANLINES_texture_size, VAR.scanline_texture_size_inv);
const float auto_dim_factor = levels_autodim_temp;
// Sample the phosphor mask:
const float2 tile_uv_wrap = VAR.video_uv * VAR.mask_tiles_per_screen;
const float2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(
tile_uv_wrap, VAR.mask_tile_start_uv_and_size);
float3 phosphor_mask_sample;
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
const bool sample_orig_luts = get_mask_sample_mode() > 0.5;
#else
static const bool sample_orig_luts = true;
#endif
if(sample_orig_luts)
{
// If mask_type is static, this branch will be resolved statically.
if(mask_type < 0.5)
{
phosphor_mask_sample = tex2D_linearize(
mask_grille_texture_large, mask_tex_uv).rgb;
}
else if(mask_type < 1.5)
{
phosphor_mask_sample = tex2D_linearize(
mask_slot_texture_large, mask_tex_uv).rgb;
}
else
{
phosphor_mask_sample = tex2D_linearize(
mask_shadow_texture_large, mask_tex_uv).rgb;
}
}
else
{
// Sample the resized mask, and avoid tiling artifacts:
phosphor_mask_sample = tex2Dtiled_mask_linearize(
MASK_RESIZE, mask_tex_uv).rgb;
}
// Sample the halation texture (auto-dim to match the scanlines), and
// account for both horizontal and vertical convergence offsets, given
// in units of texels horizontally and same-field scanlines vertically:
const float3 halation_color = tex2D_linearize(
HALATION_BLUR, VAR.halation_tex_uv).rgb;
// Apply halation: Halation models electrons flying around under the glass
// and hitting the wrong phosphors (of any color). It desaturates, so
// average the halation electrons to a scalar. Reduce the local scanline
// intensity accordingly to conserve energy.
const float3 halation_intensity_dim =
dot(halation_color, auto_dim_factor.xxx/3.0).xxx;
const float3 electron_intensity_dim = lerp(scanline_color_dim,
halation_intensity_dim, halation_weight);
// Apply the phosphor mask:
const float3 phosphor_emission_dim = electron_intensity_dim *
phosphor_mask_sample;
#ifdef PHOSPHOR_BLOOM_FAKE
// The BLOOM_APPROX pass approximates a blurred version of a masked
// and scanlined image. It's usually used to compute the brightpass,
// but we can also use it to fake the bloom stage entirely. Caveats:
// 1.) A fake bloom is conceptually different, since we're mixing in a
// fully blurred low-res image, and the biggest implication are:
// 2.) If mask_amplify is incorrect, results deteriorate more quickly.
// 3.) The inaccurate blurring hurts quality in high-contrast areas.
// 4.) The bloom_underestimate_levels parameter seems less sensitive.
// Reverse the auto-dimming and amplify to compensate for mask dimming:
#define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
#ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
static const float blur_contrast = 1.05;
#else
static const float blur_contrast = 1.0;
#endif
const float mask_amplify = get_mask_amplify();
const float undim_factor = 1.0/auto_dim_factor;
const float3 phosphor_emission =
phosphor_emission_dim * undim_factor * mask_amplify;
// Get a phosphor blur estimate, accounting for convergence offsets:
const float3 electron_intensity = electron_intensity_dim * undim_factor;
const float3 phosphor_blur_approx_soft = tex2D_linearize(
BLOOM_APPROX, VAR.blur3x3_tex_uv).rgb;
const float3 phosphor_blur_approx = lerp(phosphor_blur_approx_soft,
electron_intensity, 0.1) * blur_contrast;
// We could blend between phosphor_emission and phosphor_blur_approx,
// solving for the minimum blend_ratio that avoids clipping past 1.0:
// 1.0 >= total_intensity
// 1.0 >= phosphor_emission * (1.0 - blend_ratio) +
// phosphor_blur_approx * blend_ratio
// blend_ratio = (phosphor_emission - 1.0)/
// (phosphor_emission - phosphor_blur_approx);
// However, this blurs far more than necessary, because it aims for
// full brightness, not minimal blurring. To fix it, base blend_ratio
// on a max area intensity only so it varies more smoothly:
const float3 phosphor_blur_underestimate =
phosphor_blur_approx * bloom_underestimate_levels;
const float3 area_max_underestimate =
phosphor_blur_underestimate * mask_amplify;
#ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
const float3 blend_ratio_temp =
(area_max_underestimate - 1.0.xxx) /
(area_max_underestimate - phosphor_blur_underestimate);
#else
// Try doing it like an area-based brightpass. This is nearly
// identical, but it's worth toying with the code in case I ever
// find a way to make it look more like a real bloom. (I've had
// some promising textures from combining an area-based blend ratio
// for the phosphor blur and a more brightpass-like blend-ratio for
// the phosphor emission, but I haven't found a way to make the
// brightness correct across the whole color range, especially with
// different bloom_underestimate_levels values.)
const float desired_triad_size = lerp(mask_triad_size_desired,
output_size.x/mask_num_triads_desired,
mask_specify_num_triads);
const float bloom_sigma = get_min_sigma_to_blur_triad(
desired_triad_size, bloom_diff_thresh);
const float center_weight = get_center_weight(bloom_sigma);
const float3 max_area_contribution_approx =
max(0.0.xxx, phosphor_blur_approx -
center_weight * phosphor_emission);
const float3 area_contrib_underestimate =
bloom_underestimate_levels * max_area_contribution_approx;
const float3 blend_ratio_temp =
((1.0.xxx - area_contrib_underestimate) /
area_max_underestimate - 1.0.xxx) / (center_weight - 1.0);
#endif
// Clamp blend_ratio in case it's out-of-range, but be SUPER careful:
// min/max/clamp are BIZARRELY broken with lerp (optimization bug?),
// and this redundant sequence avoids bugs, at least on nVidia cards:
const float3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0);
const float3 blend_ratio = lerp(blend_ratio_clamped, 1.0.xxx, bloom_excess);
// Blend the blurred and unblurred images:
const float3 phosphor_emission_unclipped =
lerp(phosphor_emission, phosphor_blur_approx, blend_ratio);
// Simulate refractive diffusion by reusing the halation sample.
const float3 pixel_color = lerp(phosphor_emission_unclipped,
halation_color, diffusion_weight);
#else
const float3 pixel_color = phosphor_emission_dim;
#endif
// Encode if necessary, and output.
return encode_output(float4(pixel_color, 1.0));
}

View file

@ -0,0 +1,241 @@
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
#undef FIRST_PASS
////////////////////////////////// INCLUDES //////////////////////////////////
//#include "../include/user-settings.fxh"
//#include "../include/derived-settings-and-constants.fxh"
#include "../include/bind-shader-params.fxh"
#include "../include/scanline-functions.fxh"
//#include "../include/gamma-management.fxh"
///////////////////////////////// STRUCTURES /////////////////////////////////
struct out_vertex_p1
{
// Use explicit semantics so COLORx doesn't clamp values outside [0, 1].
float2 tex_uv : TEXCOORD1;
float2 uv_step : TEXCOORD2; // uv size of a texel (x) and scanline (y)
float2 il_step_multiple : TEXCOORD3; // (1, 1) = progressive, (1, 2) = interlaced
float pixel_height_in_scanlines : TEXCOORD4; // Height of an output pixel in scanlines
};
//////////////////////////////// VERTEX SHADER ///////////////////////////////
// Vertex shader generating a triangle covering the entire screen
void VS_Scanlines_Vertical_Interlacing(in uint id : SV_VertexID, out float4 position : SV_Position, out float2 texcoord : TEXCOORD, out out_vertex_p1 OUT)
{
texcoord.x = (id == 2) ? 2.0 : 0.0;
texcoord.y = (id == 1) ? 2.0 : 0.0;
position = float4(texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
OUT.tex_uv = texcoord;
float2 texture_size = VERTICAL_SCANLINES_texture_size;
float2 output_size = float2(TEXTURE_SIZE.x, VIEWPORT_SIZE.y);
// Detect interlacing: il_step_multiple indicates the step multiple between
// lines: 1 is for progressive sources, and 2 is for interlaced sources.
// const float2 video_size = 1.0/NormalizedNativePixelSize;
const float y_step = 1.0 + float(is_interlaced(video_size.y));
OUT.il_step_multiple = float2(1.0, y_step);
// Get the uv tex coords step between one texel (x) and scanline (y):
OUT.uv_step = OUT.il_step_multiple / texture_size;
// If shader parameters are used, {min, max}_{sigma, shape} are runtime
// values. Compute {sigma, shape}_range outside of scanline_contrib() so
// they aren't computed once per scanline (6 times per fragment and up to
// 18 times per vertex):
/* const float sigma_range = max(beam_max_sigma, beam_min_sigma) -
beam_min_sigma;
const float shape_range = max(beam_max_shape, beam_min_shape) -
beam_min_shape;
*/
// We need the pixel height in scanlines for antialiased/integral sampling:
const float ph = (video_size.y / output_size.y) /
OUT.il_step_multiple.y;
OUT.pixel_height_in_scanlines = ph;
}
/////////////////////////////// FRAGMENT SHADER //////////////////////////////
float4 PS_Scanlines_Vertical_Interlacing(float4 vpos: SV_Position, float2 vTexCoord : TEXCOORD, in out_vertex_p1 VAR) : SV_Target
{
// This pass: Sample multiple (misconverged?) scanlines to the final
// vertical resolution. Temporarily auto-dim the output to avoid clipping.
// Read some attributes into local variables:
const float2 texture_size = VERTICAL_SCANLINES_texture_size;
const float2 texture_size_inv = 1.0/texture_size;
const float2 uv_step = VAR.uv_step;
const float2 il_step_multiple = VAR.il_step_multiple;
const float frame_count = FrameCount;
const float ph = VAR.pixel_height_in_scanlines;
// Get the uv coords of the previous scanline (in this field), and the
// scanline's distance from this sample, in scanlines.
float dist;
const float2 scanline_uv = get_last_scanline_uv(VAR.tex_uv, texture_size,
texture_size_inv, il_step_multiple, frame_count, dist);
// Consider 2, 3, 4, or 6 scanlines numbered 0-5: The previous and next
// scanlines are numbered 2 and 3. Get scanline colors colors (ignore
// horizontal sampling, since since IN.output_size.x = video_size.x).
// NOTE: Anisotropic filtering creates interlacing artifacts, which is why
// ORIG_LINEARIZED bobbed any interlaced input before this pass.
const float2 v_step = float2(0.0, uv_step.y);
const float3 scanline2_color = tex2D_linearize(ORIG_LINEARIZED, scanline_uv).rgb;
const float3 scanline3_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv + v_step).rgb;
float3 scanline0_color, scanline1_color, scanline4_color, scanline5_color,
scanline_outside_color;
float dist_round;
// Use scanlines 0, 1, 4, and 5 for a total of 6 scanlines:
if(beam_num_scanlines > 5.5)
{
scanline1_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv - v_step).rgb;
scanline4_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv + 2.0 * v_step).rgb;
scanline0_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv - 2.0 * v_step).rgb;
scanline5_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv + 3.0 * v_step).rgb;
}
// Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines:
else if(beam_num_scanlines > 4.5)
{
scanline1_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv - v_step).rgb;
scanline4_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv + 2.0 * v_step).rgb;
// dist is in [0, 1]
dist_round = round(dist);
const float2 sample_0_or_5_uv_off =
lerp(-2.0 * v_step, 3.0 * v_step, dist_round);
// Call this "scanline_outside_color" to cope with the conditional
// scanline number:
scanline_outside_color = tex2D_linearize(
ORIG_LINEARIZED, scanline_uv + sample_0_or_5_uv_off).rgb;
}
// Use scanlines 1 and 4 for a total of 4 scanlines:
else if(beam_num_scanlines > 3.5)
{
scanline1_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv - v_step).rgb;
scanline4_color =
tex2D_linearize(ORIG_LINEARIZED, scanline_uv + 2.0 * v_step).rgb;
}
// Use scanline 1 or 4 for a total of 3 scanlines:
else if(beam_num_scanlines > 2.5)
{
// dist is in [0, 1]
dist_round = round(dist);
const float2 sample_1or4_uv_off =
lerp(-v_step, 2.0 * v_step, dist_round);
scanline_outside_color = tex2D_linearize(
ORIG_LINEARIZED, scanline_uv + sample_1or4_uv_off).rgb;
}
// Compute scanline contributions, accounting for vertical convergence.
// Vertical convergence offsets are in units of current-field scanlines.
// dist2 means "positive sample distance from scanline 2, in scanlines:"
float3 dist2 = dist.xxx;
if(beam_misconvergence)
{
const float3 convergence_offsets_vert_rgb =
get_convergence_offsets_y_vector();
dist2 = dist.xxx - convergence_offsets_vert_rgb;
}
// Calculate {sigma, shape}_range outside of scanline_contrib so it's only
// done once per pixel (not 6 times) with runtime params. Don't reuse the
// vertex shader calculations, so static versions can be constant-folded.
const float sigma_range = max(beam_max_sigma, beam_min_sigma) -
beam_min_sigma;
const float shape_range = max(beam_max_shape, beam_min_shape) -
beam_min_shape;
// Calculate and sum final scanline contributions, starting with lines 2/3.
// There is no normalization step, because we're not interpolating a
// continuous signal. Instead, each scanline is an additive light source.
const float3 scanline2_contrib = scanline_contrib(dist2,
scanline2_color, ph, sigma_range, shape_range);
const float3 scanline3_contrib = scanline_contrib(abs(1.0.xxx - dist2),
scanline3_color, ph, sigma_range, shape_range);
float3 scanline_intensity = scanline2_contrib + scanline3_contrib;
if(beam_num_scanlines > 5.5)
{
const float3 scanline0_contrib =
scanline_contrib(dist2 + 2.0.xxx, scanline0_color,
ph, sigma_range, shape_range);
const float3 scanline1_contrib =
scanline_contrib(dist2 + 1.0.xxx, scanline1_color,
ph, sigma_range, shape_range);
const float3 scanline4_contrib =
scanline_contrib(abs(2.0.xxx - dist2), scanline4_color,
ph, sigma_range, shape_range);
const float3 scanline5_contrib =
scanline_contrib(abs(3.0.xxx - dist2), scanline5_color,
ph, sigma_range, shape_range);
scanline_intensity += scanline0_contrib + scanline1_contrib +
scanline4_contrib + scanline5_contrib;
}
else if(beam_num_scanlines > 4.5)
{
const float3 scanline1_contrib =
scanline_contrib(dist2 + 1.0.xxx, scanline1_color,
ph, sigma_range, shape_range);
const float3 scanline4_contrib =
scanline_contrib(abs(2.0.xxx - dist2), scanline4_color,
ph, sigma_range, shape_range);
const float3 dist0or5 = lerp(
dist2 + 2.0.xxx, 3.0.xxx - dist2, dist_round);
const float3 scanline0or5_contrib = scanline_contrib(
dist0or5, scanline_outside_color, ph, sigma_range, shape_range);
scanline_intensity += scanline1_contrib + scanline4_contrib +
scanline0or5_contrib;
}
else if(beam_num_scanlines > 3.5)
{
const float3 scanline1_contrib =
scanline_contrib(dist2 + 1.0.xxx, scanline1_color,
ph, sigma_range, shape_range);
const float3 scanline4_contrib =
scanline_contrib(abs(2.0.xxx - dist2), scanline4_color,
ph, sigma_range, shape_range);
scanline_intensity += scanline1_contrib + scanline4_contrib;
}
else if(beam_num_scanlines > 2.5)
{
const float3 dist1or4 = lerp(
dist2 + 1.0.xxx, 2.0.xxx - dist2, dist_round);
const float3 scanline1or4_contrib = scanline_contrib(
dist1or4, scanline_outside_color, ph, sigma_range, shape_range);
scanline_intensity += scanline1or4_contrib;
}
// Auto-dim the image to avoid clipping, encode if necessary, and output.
// My original idea was to compute a minimal auto-dim factor and put it in
// the alpha channel, but it wasn't working, at least not reliably. This
// is faster anyway, levels_autodim_temp = 0.5 isn't causing banding.
return encode_output(float4(scanline_intensity * levels_autodim_temp, 1.0));
}

View file

@ -32,12 +32,11 @@
*/
uniform bool geom_curvature <
ui_type = "radio";
ui_category = "Geom Curvature";
ui_label = "Geom Curvature Toggle";
> = 1.0;
> = 0.0;
uniform float geom_R <
ui_type = "drag";

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 214 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 202 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 KiB