mirror of
https://github.com/RetroDECK/Duckstation.git
synced 2024-11-23 14:25:37 +00:00
504 lines
18 KiB
HLSL
504 lines
18 KiB
HLSL
#ifndef _SPECIAL_FUNCTIONS_H
|
|
#define _SPECIAL_FUNCTIONS_H
|
|
|
|
///////////////////////////////// MIT LICENSE ////////////////////////////////
|
|
|
|
// Copyright (C) 2014 TroggleMonkey
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to
|
|
// deal in the Software without restriction, including without limitation the
|
|
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
// sell copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
// IN THE SOFTWARE.
|
|
|
|
|
|
///////////////////////////////// DESCRIPTION ////////////////////////////////
|
|
|
|
// This file implements the following mathematical special functions:
|
|
// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))
|
|
// 2.) gamma(s), a real-numbered extension of the integer factorial function
|
|
// It also implements normalized_ligamma(s, z), a normalized lower incomplete
|
|
// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can
|
|
// be called with an _impl suffix to use an implementation version with a few
|
|
// extra precomputed parameters (which may be useful for the caller to reuse).
|
|
// See below for details.
|
|
//
|
|
// Design Rationale:
|
|
// Pretty much every line of code in this file is duplicated four times for
|
|
// different input types (float4/float3/float2/float). This is unfortunate,
|
|
// but Cg doesn't allow function templates. Macros would be far less verbose,
|
|
// but they would make the code harder to document and read. I don't expect
|
|
// these functions will require a whole lot of maintenance changes unless
|
|
// someone ever has need for more robust incomplete gamma functions, so code
|
|
// duplication seems to be the lesser evil in this case.
|
|
|
|
|
|
/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////
|
|
|
|
float4 erf6(float4 x)
|
|
{
|
|
// Requires: x is the standard parameter to erf().
|
|
// Returns: Return an Abramowitz/Stegun approximation of erf(), where:
|
|
// erf(x) = 2/sqrt(pi) * integral(e**(-x**2))
|
|
// This approximation has a max absolute error of 2.5*10**-5
|
|
// with solid numerical robustness and efficiency. See:
|
|
// https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
|
|
const float4 sign_x = sign(x);
|
|
const float4 t = 1.0/(1.0 + 0.47047*abs(x));
|
|
const float4 result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
|
|
exp(-(x*x));
|
|
return result * sign_x;
|
|
}
|
|
|
|
float3 erf6(const float3 x)
|
|
{
|
|
// Float3 version:
|
|
const float3 sign_x = sign(x);
|
|
const float3 t = 1.0/(1.0 + 0.47047*abs(x));
|
|
const float3 result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
|
|
exp(-(x*x));
|
|
return result * sign_x;
|
|
}
|
|
|
|
float2 erf6(const float2 x)
|
|
{
|
|
// Float2 version:
|
|
const float2 sign_x = sign(x);
|
|
const float2 t = 1.0/(1.0 + 0.47047*abs(x));
|
|
const float2 result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
|
|
exp(-(x*x));
|
|
return result * sign_x;
|
|
}
|
|
|
|
float erf6(const float x)
|
|
{
|
|
// Float version:
|
|
const float sign_x = sign(x);
|
|
const float t = 1.0/(1.0 + 0.47047*abs(x));
|
|
const float result = 1.0 - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
|
|
exp(-(x*x));
|
|
return result * sign_x;
|
|
}
|
|
|
|
float4 erft(const float4 x)
|
|
{
|
|
// Requires: x is the standard parameter to erf().
|
|
// Returns: Approximate erf() with the hyperbolic tangent. The error is
|
|
// visually noticeable, but it's blazing fast and perceptually
|
|
// close...at least on ATI hardware. See:
|
|
// http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
|
|
// Warning: Only use this if your hardware drivers correctly implement
|
|
// tanh(): My nVidia 8800GTS returns garbage output.
|
|
return tanh(1.202760580 * x);
|
|
}
|
|
|
|
float3 erft(const float3 x)
|
|
{
|
|
// Float3 version:
|
|
return tanh(1.202760580 * x);
|
|
}
|
|
|
|
float2 erft(const float2 x)
|
|
{
|
|
// Float2 version:
|
|
return tanh(1.202760580 * x);
|
|
}
|
|
|
|
float erft(const float x)
|
|
{
|
|
// Float version:
|
|
return tanh(1.202760580 * x);
|
|
}
|
|
|
|
float4 erf(const float4 x)
|
|
{
|
|
// Requires: x is the standard parameter to erf().
|
|
// Returns: Some approximation of erf(x), depending on user settings.
|
|
#ifdef ERF_FAST_APPROXIMATION
|
|
return erft(x);
|
|
#else
|
|
return erf6(x);
|
|
#endif
|
|
}
|
|
|
|
float3 erf(const float3 x)
|
|
{
|
|
// Float3 version:
|
|
#ifdef ERF_FAST_APPROXIMATION
|
|
return erft(x);
|
|
#else
|
|
return erf6(x);
|
|
#endif
|
|
}
|
|
|
|
float2 erf(const float2 x)
|
|
{
|
|
// Float2 version:
|
|
#ifdef ERF_FAST_APPROXIMATION
|
|
return erft(x);
|
|
#else
|
|
return erf6(x);
|
|
#endif
|
|
}
|
|
|
|
float erf(const float x)
|
|
{
|
|
// Float version:
|
|
#ifdef ERF_FAST_APPROXIMATION
|
|
return erft(x);
|
|
#else
|
|
return erf6(x);
|
|
#endif
|
|
}
|
|
|
|
|
|
/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////
|
|
|
|
float4 gamma_impl(const float4 s, const float4 s_inv)
|
|
{
|
|
// Requires: 1.) s is the standard parameter to the gamma function, and
|
|
// it should lie in the [0, 36] range.
|
|
// 2.) s_inv = 1.0/s. This implementation function requires
|
|
// the caller to precompute this value, giving users the
|
|
// opportunity to reuse it.
|
|
// Returns: Return approximate gamma function (real-numbered factorial)
|
|
// output using the Lanczos approximation with two coefficients
|
|
// calculated using Paul Godfrey's method here:
|
|
// http://my.fit.edu/~gabdo/gamma.txt
|
|
// An optimal g value for s in [0, 36] is ~1.12906830989, with
|
|
// a maximum relative error of 0.000463 for 2**16 equally
|
|
// evals. We could use three coeffs (0.0000346 error) without
|
|
// hurting latency, but this allows more parallelism with
|
|
// outside instructions.
|
|
static const float g = 1.12906830989;
|
|
static const float c0 = 0.8109119309638332633713423362694399653724431;
|
|
static const float c1 = 0.4808354605142681877121661197951496120000040;
|
|
static const float e = 2.71828182845904523536028747135266249775724709;
|
|
const float4 sph = s + 0.5;
|
|
const float4 lanczos_sum = c0 + c1/(s + 1.0);
|
|
const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e
|
|
// gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
|
|
// This has less error for small s's than (s -= 1.0) at the beginning.
|
|
return (pow(base, sph) * lanczos_sum) * s_inv;
|
|
}
|
|
|
|
float3 gamma_impl(const float3 s, const float3 s_inv)
|
|
{
|
|
// Float3 version:
|
|
static const float g = 1.12906830989;
|
|
static const float c0 = 0.8109119309638332633713423362694399653724431;
|
|
static const float c1 = 0.4808354605142681877121661197951496120000040;
|
|
static const float e = 2.71828182845904523536028747135266249775724709;
|
|
const float3 sph = s + 0.5;
|
|
const float3 lanczos_sum = c0 + c1/(s + 1.0);
|
|
const float3 base = (sph + g)/e;
|
|
return (pow(base, sph) * lanczos_sum) * s_inv;
|
|
}
|
|
|
|
float2 gamma_impl(const float2 s, const float2 s_inv)
|
|
{
|
|
// Float2 version:
|
|
static const float g = 1.12906830989;
|
|
static const float c0 = 0.8109119309638332633713423362694399653724431;
|
|
static const float c1 = 0.4808354605142681877121661197951496120000040;
|
|
static const float e = 2.71828182845904523536028747135266249775724709;
|
|
const float2 sph = s + 0.5;
|
|
const float2 lanczos_sum = c0 + c1/(s + 1.0);
|
|
const float2 base = (sph + g)/e;
|
|
return (pow(base, sph) * lanczos_sum) * s_inv;
|
|
}
|
|
|
|
float gamma_impl(const float s, const float s_inv)
|
|
{
|
|
// Float version:
|
|
static const float g = 1.12906830989;
|
|
static const float c0 = 0.8109119309638332633713423362694399653724431;
|
|
static const float c1 = 0.4808354605142681877121661197951496120000040;
|
|
static const float e = 2.71828182845904523536028747135266249775724709;
|
|
const float sph = s + 0.5;
|
|
const float lanczos_sum = c0 + c1/(s + 1.0);
|
|
const float base = (sph + g)/e;
|
|
return (pow(base, sph) * lanczos_sum) * s_inv;
|
|
}
|
|
|
|
float4 gamma(const float4 s)
|
|
{
|
|
// Requires: s is the standard parameter to the gamma function, and it
|
|
// should lie in the [0, 36] range.
|
|
// Returns: Return approximate gamma function output with a maximum
|
|
// relative error of 0.000463. See gamma_impl for details.
|
|
return gamma_impl(s, 1.0/s);
|
|
}
|
|
|
|
float3 gamma(const float3 s)
|
|
{
|
|
// Float3 version:
|
|
return gamma_impl(s, 1.0/s);
|
|
}
|
|
|
|
float2 gamma(const float2 s)
|
|
{
|
|
// Float2 version:
|
|
return gamma_impl(s, 1.0/s);
|
|
}
|
|
|
|
float gamma(const float s)
|
|
{
|
|
// Float version:
|
|
return gamma_impl(s, 1.0/s);
|
|
}
|
|
|
|
|
|
//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////
|
|
|
|
// Lower incomplete gamma function for small s and z (implementation):
|
|
float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
|
|
{
|
|
// Requires: 1.) s < ~0.5
|
|
// 2.) z <= ~0.775075
|
|
// 3.) s_inv = 1.0/s (precomputed for outside reuse)
|
|
// Returns: A series representation for the lower incomplete gamma
|
|
// function for small s and small z (4 terms).
|
|
// The actual "rolled up" summation looks like:
|
|
// last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;
|
|
// sum = last_sign * last_pow / ((s + k) * last_factorial)
|
|
// for(int i = 0; i < 4; ++i)
|
|
// {
|
|
// last_sign *= -1.0; last_pow *= z; last_factorial *= i;
|
|
// sum += last_sign * last_pow / ((s + k) * last_factorial);
|
|
// }
|
|
// Unrolled, constant-unfolded and arranged for madds and parallelism:
|
|
const float4 scale = pow(z, s);
|
|
float4 sum = s_inv; // Summation iteration 0 result
|
|
// Summation iterations 1, 2, and 3:
|
|
const float4 z_sq = z*z;
|
|
const float4 denom1 = s + 1.0;
|
|
const float4 denom2 = 2.0*s + 4.0;
|
|
const float4 denom3 = 6.0*s + 18.0;
|
|
//float4 denom4 = 24.0*s + float4(96.0);
|
|
sum -= z/denom1;
|
|
sum += z_sq/denom2;
|
|
sum -= z * z_sq/denom3;
|
|
//sum += z_sq * z_sq / denom4;
|
|
// Scale and return:
|
|
return scale * sum;
|
|
}
|
|
|
|
float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
|
|
{
|
|
// Float3 version:
|
|
const float3 scale = pow(z, s);
|
|
float3 sum = s_inv;
|
|
const float3 z_sq = z*z;
|
|
const float3 denom1 = s + 1.0;
|
|
const float3 denom2 = 2.0*s + 4.0;
|
|
const float3 denom3 = 6.0*s + 18.0;
|
|
sum -= z/denom1;
|
|
sum += z_sq/denom2;
|
|
sum -= z * z_sq/denom3;
|
|
return scale * sum;
|
|
}
|
|
|
|
float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
|
|
{
|
|
// Float2 version:
|
|
const float2 scale = pow(z, s);
|
|
float2 sum = s_inv;
|
|
const float2 z_sq = z*z;
|
|
const float2 denom1 = s + 1.0;
|
|
const float2 denom2 = 2.0*s + 4.0;
|
|
const float2 denom3 = 6.0*s + 18.0;
|
|
sum -= z/denom1;
|
|
sum += z_sq/denom2;
|
|
sum -= z * z_sq/denom3;
|
|
return scale * sum;
|
|
}
|
|
|
|
float ligamma_small_z_impl(const float s, const float z, const float s_inv)
|
|
{
|
|
// Float version:
|
|
const float scale = pow(z, s);
|
|
float sum = s_inv;
|
|
const float z_sq = z*z;
|
|
const float denom1 = s + 1.0;
|
|
const float denom2 = 2.0*s + 4.0;
|
|
const float denom3 = 6.0*s + 18.0;
|
|
sum -= z/denom1;
|
|
sum += z_sq/denom2;
|
|
sum -= z * z_sq/denom3;
|
|
return scale * sum;
|
|
}
|
|
|
|
// Upper incomplete gamma function for small s and large z (implementation):
|
|
float4 uigamma_large_z_impl(const float4 s, const float4 z)
|
|
{
|
|
// Requires: 1.) s < ~0.5
|
|
// 2.) z > ~0.775075
|
|
// Returns: Gauss's continued fraction representation for the upper
|
|
// incomplete gamma function (4 terms).
|
|
// The "rolled up" continued fraction looks like this. The denominator
|
|
// is truncated, and it's calculated "from the bottom up:"
|
|
// denom = float4('inf');
|
|
// float4 one = float4(1.0);
|
|
// for(int i = 4; i > 0; --i)
|
|
// {
|
|
// denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
|
|
// }
|
|
// Unrolled and constant-unfolded for madds and parallelism:
|
|
const float4 numerator = pow(z, s) * exp(-z);
|
|
float4 denom = 7.0 + z - s;
|
|
denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
|
|
denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
|
|
denom = 1.0 + z - s + (s - 1.0)/denom;
|
|
return numerator / denom;
|
|
}
|
|
|
|
float3 uigamma_large_z_impl(const float3 s, const float3 z)
|
|
{
|
|
// Float3 version:
|
|
const float3 numerator = pow(z, s) * exp(-z);
|
|
float3 denom = 7.0 + z - s;
|
|
denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
|
|
denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
|
|
denom = 1.0 + z - s + (s - 1.0)/denom;
|
|
return numerator / denom;
|
|
}
|
|
|
|
float2 uigamma_large_z_impl(const float2 s, const float2 z)
|
|
{
|
|
// Float2 version:
|
|
const float2 numerator = pow(z, s) * exp(-z);
|
|
float2 denom = 7.0 + z - s;
|
|
denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
|
|
denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
|
|
denom = 1.0 + z - s + (s - 1.0)/denom;
|
|
return numerator / denom;
|
|
}
|
|
|
|
float uigamma_large_z_impl(const float s, const float z)
|
|
{
|
|
// Float version:
|
|
const float numerator = pow(z, s) * exp(-z);
|
|
float denom = 7.0 + z - s;
|
|
denom = 5.0 + z - s + (3.0*s - 9.0)/denom;
|
|
denom = 3.0 + z - s + (2.0*s - 4.0)/denom;
|
|
denom = 1.0 + z - s + (s - 1.0)/denom;
|
|
return numerator / denom;
|
|
}
|
|
|
|
// Normalized lower incomplete gamma function for small s (implementation):
|
|
float4 normalized_ligamma_impl(const float4 s, const float4 z,
|
|
const float4 s_inv, const float4 gamma_s_inv)
|
|
{
|
|
// Requires: 1.) s < ~0.5
|
|
// 2.) s_inv = 1/s (precomputed for outside reuse)
|
|
// 3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
|
|
// Returns: Approximate the normalized lower incomplete gamma function
|
|
// for s < 0.5. Since we only care about s < 0.5, we only need
|
|
// to evaluate two branches (not four) based on z. Each branch
|
|
// uses four terms, with a max relative error of ~0.00182. The
|
|
// branch threshold and specifics were adapted for fewer terms
|
|
// from Gil/Segura/Temme's paper here:
|
|
// http://oai.cwi.nl/oai/asset/20433/20433B.pdf
|
|
// Evaluate both branches: Real branches test slower even when available.
|
|
static const float thresh = 0.775075;
|
|
int4 z_is_large;
|
|
z_is_large.x = int(z.x > thresh);
|
|
z_is_large.y = int(z.y > thresh);
|
|
z_is_large.z = int(z.z > thresh);
|
|
z_is_large.w = int(z.w > thresh);
|
|
const float4 large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
|
|
const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
|
|
// Combine the results from both branches:
|
|
int4 inverse_z_is_large = saturate(~(z_is_large));
|
|
return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large);
|
|
}
|
|
|
|
float3 normalized_ligamma_impl(const float3 s, const float3 z,
|
|
const float3 s_inv, const float3 gamma_s_inv)
|
|
{
|
|
// Float3 version:
|
|
static const float thresh = 0.775075;
|
|
int3 z_is_large;
|
|
z_is_large.x = int(z.x > thresh);
|
|
z_is_large.y = int(z.y > thresh);
|
|
z_is_large.z = int(z.z > thresh);
|
|
const float3 large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
|
|
const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
|
|
int3 inverse_z_is_large = saturate(~(z_is_large));
|
|
return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large);
|
|
}
|
|
|
|
float2 normalized_ligamma_impl(const float2 s, const float2 z,
|
|
const float2 s_inv, const float2 gamma_s_inv)
|
|
{
|
|
// Float2 version:
|
|
static const float thresh = 0.775075;
|
|
int2 z_is_large;
|
|
z_is_large.x = int(z.x > thresh);
|
|
z_is_large.y = int(z.y > thresh);
|
|
const float2 large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
|
|
const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
|
|
int2 inverse_z_is_large = saturate(~(z_is_large));
|
|
return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large);
|
|
}
|
|
|
|
float normalized_ligamma_impl(const float s, const float z,
|
|
const float s_inv, const float gamma_s_inv)
|
|
{
|
|
// Float version:
|
|
static const float thresh = 0.775075;
|
|
const bool z_is_large = z > thresh;
|
|
const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
|
|
const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
|
|
return large_z * float(z_is_large) + small_z * float(!z_is_large);
|
|
}
|
|
|
|
// Normalized lower incomplete gamma function for small s:
|
|
float4 normalized_ligamma(const float4 s, const float4 z)
|
|
{
|
|
// Requires: s < ~0.5
|
|
// Returns: Approximate the normalized lower incomplete gamma function
|
|
// for s < 0.5. See normalized_ligamma_impl() for details.
|
|
const float4 s_inv = 1.0/s;
|
|
const float4 gamma_s_inv = 1.0/gamma_impl(s, s_inv);
|
|
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
|
|
}
|
|
|
|
float3 normalized_ligamma(const float3 s, const float3 z)
|
|
{
|
|
// Float3 version:
|
|
const float3 s_inv = 1.0/s;
|
|
const float3 gamma_s_inv = 1.0/gamma_impl(s, s_inv);
|
|
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
|
|
}
|
|
|
|
float2 normalized_ligamma(const float2 s, const float2 z)
|
|
{
|
|
// Float2 version:
|
|
const float2 s_inv = 1.0/s;
|
|
const float2 gamma_s_inv = 1.0/gamma_impl(s, s_inv);
|
|
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
|
|
}
|
|
|
|
float normalized_ligamma(const float s, const float z)
|
|
{
|
|
// Float version:
|
|
const float s_inv = 1.0/s;
|
|
const float gamma_s_inv = 1.0/gamma_impl(s, s_inv);
|
|
return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
|
|
}
|
|
|
|
#endif // _SPECIAL_FUNCTIONS_H |