Duckstation/data/resources/shaders/reshade/Shaders/crt/crt-royale/include/special-functions.fxh

#ifndef SPECIAL_FUNCTIONS_H
#define SPECIAL_FUNCTIONS_H

/////////////////////////////////  MIT LICENSE  ////////////////////////////////

//  Copyright (C) 2014 TroggleMonkey
//
//  Permission is hereby granted, free of charge, to any person obtaining a copy
//  of this software and associated documentation files (the "Software"), to
//  deal in the Software without restriction, including without limitation the
//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
//  sell copies of the Software, and to permit persons to whom the Software is
//  furnished to do so, subject to the following conditions:
//
//  The above copyright notice and this permission notice shall be included in
//  all copies or substantial portions of the Software.
//
//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
//  IN THE SOFTWARE.


/////////////////////////////////  DESCRIPTION  ////////////////////////////////

//  This file implements the following mathematical special functions:
//  1.) erf(x) = 2/sqrt(pi) * integral from 0 to x of e**(-t**2) dt
//  2.) gamma(s), a real-numbered extension of the integer factorial function
//  It also implements normalized_ligamma(s, z), a normalized lower incomplete
//  gamma function for s < 0.5 only.  Both gamma() and normalized_ligamma() can
//  be called with an _impl suffix to use an implementation version with a few
//  extra precomputed parameters (which may be useful for the caller to reuse).
//  See below for details.
//
//  Design Rationale:
//  Pretty much every line of code in this file is duplicated four times for
//  different input types (float4/float3/float2/float).  This is unfortunate,
//  but Cg doesn't allow function templates.  Macros would be far less verbose,
//  but they would make the code harder to document and read.  I don't expect
//  these functions will require a whole lot of maintenance changes unless
//  someone ever has need for more robust incomplete gamma functions, so code
//  duplication seems to be the lesser evil in this case.


///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////

float4 erf6(float4 x)
{
	//  Requires:   x carries four independent arguments to erf().
	//  Returns:    A per-component Abramowitz/Stegun style estimate of erf(x).
	//              With u = 1/(1 + 0.47047*|x|), a cubic in u is scaled by
	//              exp(-x*x) and subtracted from one; the sign of x is then
	//              reapplied so the result is odd in x.  The original author
	//              quotes a maximum absolute error of 2.5*10**-5.  See:
	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
	const float4 magnitude = abs(x);
	const float4 u = 1.0.xxxx/(1.0.xxxx + 0.47047*magnitude);
	//  Cubic in Horner form, built from the innermost coefficient outward:
	const float4 inner = -0.0958798 + u*0.7478556;
	const float4 cubic = u*(0.3480242 + u*inner);
	const float4 gaussian = exp(-(x*x));
	const float4 unsigned_erf = 1.0.xxxx - cubic*gaussian;
	const float4 sign_of_x = sign(x);
	return unsigned_erf * sign_of_x;
}

float3 erf6(const float3 x)
{
	//  Three-component overload: the same cubic-times-Gaussian estimate of
	//  erf(x), evaluated independently for each component of x.
	const float3 magnitude = abs(x);
	const float3 u = 1.0.xxx/(1.0.xxx + 0.47047*magnitude);
	//  Cubic in Horner form, built from the innermost coefficient outward:
	const float3 inner = -0.0958798 + u*0.7478556;
	const float3 cubic = u*(0.3480242 + u*inner);
	const float3 gaussian = exp(-(x*x));
	const float3 unsigned_erf = 1.0.xxx - cubic*gaussian;
	const float3 sign_of_x = sign(x);
	return unsigned_erf * sign_of_x;
}

float2 erf6(const float2 x)
{
	//  Two-component overload: the same cubic-times-Gaussian estimate of
	//  erf(x), evaluated independently for each component of x.
	const float2 magnitude = abs(x);
	const float2 u = 1.0.xx/(1.0.xx + 0.47047*magnitude);
	//  Cubic in Horner form, built from the innermost coefficient outward:
	const float2 inner = -0.0958798 + u*0.7478556;
	const float2 cubic = u*(0.3480242 + u*inner);
	const float2 gaussian = exp(-(x*x));
	const float2 unsigned_erf = 1.0.xx - cubic*gaussian;
	const float2 sign_of_x = sign(x);
	return unsigned_erf * sign_of_x;
}

float erf6(const float x)
{
	//  Scalar overload: estimate erf(x) as one minus a cubic in
	//  u = 1/(1 + 0.47047*|x|) scaled by exp(-x*x), then reapply the sign of x.
	const float magnitude = abs(x);
	const float u = 1.0/(1.0 + 0.47047*magnitude);
	//  Cubic in Horner form, built from the innermost coefficient outward:
	const float inner = -0.0958798 + u*0.7478556;
	const float cubic = u*(0.3480242 + u*inner);
	const float gaussian = exp(-(x*x));
	const float unsigned_erf = 1.0 - cubic*gaussian;
	const float sign_of_x = sign(x);
	return unsigned_erf * sign_of_x;
}

float4 erft(const float4 x)
{
	//  Requires:   x carries four independent arguments to erf().
	//  Returns:    tanh(1.202760580 * x) per component, used as a cheap
	//              stand-in for erf(x).  The original author describes the
	//              error as visually noticeable but perceptually close.  See:
	//                  http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html
	//  Warning:    The result is only as good as the driver's tanh(); the
	//              original author reports garbage output on an nVidia
	//              8800GTS.
	const float4 scaled_x = x * 1.202760580;
	return tanh(scaled_x);
}

float3 erft(const float3 x)
{
	//  Three-component overload of the tanh-based erf() stand-in.
	const float3 scaled_x = x * 1.202760580;
	return tanh(scaled_x);
}

float2 erft(const float2 x)
{
	//  Two-component overload of the tanh-based erf() stand-in.
	const float2 scaled_x = x * 1.202760580;
	return tanh(scaled_x);
}

float erft(const float x)
{
	//  Scalar overload of the tanh-based erf() stand-in.
	const float scaled_x = x * 1.202760580;
	return tanh(scaled_x);
}

float4 erf(const float4 x)
{
	//  Requires:   x carries four independent arguments to erf().
	//  Returns:    An approximation of erf(x) chosen at compile time:
	//              erft() when ERF_FAST_APPROXIMATION is defined, erf6()
	//              otherwise.
	#ifndef ERF_FAST_APPROXIMATION
		return erf6(x);
	#else
		return erft(x);
	#endif
}

float3 erf(const float3 x)
{
	//  Three-component overload: erft() when ERF_FAST_APPROXIMATION is
	//  defined, erf6() otherwise.
	#ifndef ERF_FAST_APPROXIMATION
		return erf6(x);
	#else
		return erft(x);
	#endif
}

float2 erf(const float2 x)
{
	//  Two-component overload: erft() when ERF_FAST_APPROXIMATION is
	//  defined, erf6() otherwise.
	#ifndef ERF_FAST_APPROXIMATION
		return erf6(x);
	#else
		return erft(x);
	#endif
}

float erf(const float x)
{
	//  Scalar overload: erft() when ERF_FAST_APPROXIMATION is defined,
	//  erf6() otherwise.
	#ifndef ERF_FAST_APPROXIMATION
		return erf6(x);
	#else
		return erft(x);
	#endif
}


///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////

float4 gamma_impl(const float4 s, const float4 s_inv)
{
	//  Requires:   1.) s is the argument of the gamma function; the original
	//                  author tuned the constants for s in [0, 36].
	//              2.) s_inv = 1.0/s, supplied by the caller so that it can
	//                  be shared with other computations.
	//  Returns:    A two-coefficient Lanczos approximation of gamma(s) per
	//              component.  The coefficients follow Paul Godfrey's method:
	//                  http://my.fit.edu/~gabdo/gamma.txt
	//              The original author reports g ~= 1.12906830989 as optimal
	//              for s in [0, 36], with a maximum relative error of
	//              0.000463 over 2**16 sample points, and notes that a third
	//              coefficient (0.0000346 error) was left out to allow more
	//              parallelism with surrounding instructions.
	static const float4 lanczos_g = 1.12906830989.xxxx;
	static const float4 coeff0 = 0.8109119309638332633713423362694399653724431.xxxx;
	static const float4 coeff1 = 0.4808354605142681877121661197951496120000040.xxxx;
	static const float4 euler = 2.71828182845904523536028747135266249775724709.xxxx;
	const float4 exponent = s + 0.5.xxxx;
	const float4 series = coeff0 + coeff1/(s + 1.0.xxxx);
	const float4 pow_base = (exponent + lanczos_g)/euler;
	//  pow_base**exponent * series approximates gamma(s + 1).  Multiplying by
	//  1/s converts that to gamma(s); the original author found this more
	//  accurate for small s than shifting s by one up front.
	const float4 gamma_s_plus_one = pow(pow_base, exponent) * series;
	return gamma_s_plus_one * s_inv;
}

float3 gamma_impl(const float3 s, const float3 s_inv)
{
	//  Three-component overload of the two-coefficient Lanczos approximation
	//  of gamma(s); s_inv must equal 1.0/s.
	static const float3 lanczos_g = 1.12906830989.xxx;
	static const float3 coeff0 = 0.8109119309638332633713423362694399653724431.xxx;
	static const float3 coeff1 = 0.4808354605142681877121661197951496120000040.xxx;
	static const float3 euler = 2.71828182845904523536028747135266249775724709.xxx;
	const float3 exponent = s + 0.5.xxx;
	const float3 series = coeff0 + coeff1/(s + 1.0.xxx);
	const float3 pow_base = (exponent + lanczos_g)/euler;
	//  gamma(s + 1) first, then scale by 1/s to obtain gamma(s):
	const float3 gamma_s_plus_one = pow(pow_base, exponent) * series;
	return gamma_s_plus_one * s_inv;
}

float2 gamma_impl(const float2 s, const float2 s_inv)
{
	//  Two-component overload of the two-coefficient Lanczos approximation
	//  of gamma(s); s_inv must equal 1.0/s.
	static const float2 lanczos_g = 1.12906830989.xx;
	static const float2 coeff0 = 0.8109119309638332633713423362694399653724431.xx;
	static const float2 coeff1 = 0.4808354605142681877121661197951496120000040.xx;
	static const float2 euler = 2.71828182845904523536028747135266249775724709.xx;
	const float2 exponent = s + 0.5.xx;
	const float2 series = coeff0 + coeff1/(s + 1.0.xx);
	const float2 pow_base = (exponent + lanczos_g)/euler;
	//  gamma(s + 1) first, then scale by 1/s to obtain gamma(s):
	const float2 gamma_s_plus_one = pow(pow_base, exponent) * series;
	return gamma_s_plus_one * s_inv;
}

float gamma_impl(const float s, const float s_inv)
{
	//  Scalar overload of the two-coefficient Lanczos approximation of
	//  gamma(s); s_inv must equal 1.0/s.
	static const float lanczos_g = 1.12906830989;
	static const float coeff0 = 0.8109119309638332633713423362694399653724431;
	static const float coeff1 = 0.4808354605142681877121661197951496120000040;
	static const float euler = 2.71828182845904523536028747135266249775724709;
	const float exponent = s + 0.5;
	const float series = coeff0 + coeff1/(s + 1.0);
	const float pow_base = (exponent + lanczos_g)/euler;
	//  gamma(s + 1) first, then scale by 1/s to obtain gamma(s):
	const float gamma_s_plus_one = pow(pow_base, exponent) * series;
	return gamma_s_plus_one * s_inv;
}

float4 gamma(const float4 s)
{
	//  Requires:   s is the argument of the gamma function (the original
	//              author targets the [0, 36] range).
	//  Returns:    gamma_impl(s, 1/s), i.e. the approximate gamma function
	//              per component.  The original author quotes a maximum
	//              relative error of 0.000463.
	const float4 reciprocal_s = 1.0.xxxx/s;
	return gamma_impl(s, reciprocal_s);
}

float3 gamma(const float3 s)
{
	//  Three-component overload: computes 1/s and defers to gamma_impl().
	const float3 reciprocal_s = 1.0.xxx/s;
	return gamma_impl(s, reciprocal_s);
}

float2 gamma(const float2 s)
{
	//  Two-component overload: computes 1/s and defers to gamma_impl().
	const float2 reciprocal_s = 1.0.xx/s;
	return gamma_impl(s, reciprocal_s);
}

float gamma(const float s)
{
	//  Scalar overload: computes 1/s and defers to gamma_impl().
	const float reciprocal_s = 1.0/s;
	return gamma_impl(s, reciprocal_s);
}


////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////

//  Lower incomplete gamma function for small s and z (implementation):
float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
{
	//  Requires:   1.) s < ~0.5
	//              2.) z <= ~0.775075
	//              3.) s_inv = 1.0/s, supplied by the caller for reuse
	//  Returns:    The first four terms of the power series for the lower
	//              incomplete gamma function, per component:
	//                  z**s * sum(k = 0..3) of (-1)**k * z**k / (k! * (s + k))
	//              The k! * (s + k) denominators are pre-expanded to
	//              s, s + 1, 2*s + 4 and 6*s + 18.  A fifth term,
	//              z**4 / (24*s + 96), was left disabled by the original
	//              author.
	const float4 z_squared = z*z;
	const float4 z_cubed = z * z_squared;
	const float4 term1 = z/(s + 1.0.xxxx);
	const float4 term2 = z_squared/(2.0*s + 4.0.xxxx);
	const float4 term3 = z_cubed/(6.0*s + 18.0.xxxx);
	//  Term 0 is 1/s; accumulate left to right with alternating signs:
	const float4 series = ((s_inv - term1) + term2) - term3;
	return pow(z, s) * series;
}

float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
{
	//  Three-component overload of the four-term series for the lower
	//  incomplete gamma function; s_inv must equal 1.0/s.
	const float3 z_squared = z*z;
	const float3 z_cubed = z * z_squared;
	const float3 term1 = z/(s + 1.0.xxx);
	const float3 term2 = z_squared/(2.0*s + 4.0.xxx);
	const float3 term3 = z_cubed/(6.0*s + 18.0.xxx);
	//  Term 0 is 1/s; accumulate left to right with alternating signs:
	const float3 series = ((s_inv - term1) + term2) - term3;
	return pow(z, s) * series;
}

float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
{
	//  Two-component overload of the four-term series for the lower
	//  incomplete gamma function; s_inv must equal 1.0/s.
	const float2 z_squared = z*z;
	const float2 z_cubed = z * z_squared;
	const float2 term1 = z/(s + 1.0.xx);
	const float2 term2 = z_squared/(2.0*s + 4.0.xx);
	const float2 term3 = z_cubed/(6.0*s + 18.0.xx);
	//  Term 0 is 1/s; accumulate left to right with alternating signs:
	const float2 series = ((s_inv - term1) + term2) - term3;
	return pow(z, s) * series;
}

float ligamma_small_z_impl(const float s, const float z, const float s_inv)
{
	//  Scalar overload of the four-term series for the lower incomplete
	//  gamma function; s_inv must equal 1.0/s.
	const float z_squared = z*z;
	const float z_cubed = z * z_squared;
	const float term1 = z/(s + 1.0);
	const float term2 = z_squared/(2.0*s + 4.0);
	const float term3 = z_cubed/(6.0*s + 18.0);
	//  Term 0 is 1/s; accumulate left to right with alternating signs:
	const float series = ((s_inv - term1) + term2) - term3;
	return pow(z, s) * series;
}

//  Upper incomplete gamma function for small s and large z (implementation):
float4 uigamma_large_z_impl(const float4 s, const float4 z)
{
	//  Requires:   1.) s < ~0.5
	//              2.) z > ~0.775075
	//  Returns:    A four-level truncation of the continued fraction for the
	//              upper incomplete gamma function (attributed to Gauss by
	//              the original author), per component:
	//                  z**s * exp(-z) / level1
	//              where level_i = (2*i - 1) + z - s + i*(s - i)/level_(i+1)
	//              and the recursion is cut off after level4.  The i*(s - i)
	//              numerators are pre-expanded to 3*s - 9, 2*s - 4 and s - 1.
	//  Evaluate from the innermost level outward:
	const float4 level4 = 7.0.xxxx + z - s;
	const float4 level3 = 5.0.xxxx + z - s + (3.0*s - 9.0.xxxx)/level4;
	const float4 level2 = 3.0.xxxx + z - s + (2.0*s - 4.0.xxxx)/level3;
	const float4 level1 = 1.0.xxxx + z - s + (s - 1.0.xxxx)/level2;
	const float4 prefactor = pow(z, s) * exp(-z);
	return prefactor / level1;
}

float3 uigamma_large_z_impl(const float3 s, const float3 z)
{
	//  Three-component overload of the four-level continued fraction for the
	//  upper incomplete gamma function, evaluated innermost level first.
	const float3 level4 = 7.0.xxx + z - s;
	const float3 level3 = 5.0.xxx + z - s + (3.0*s - 9.0.xxx)/level4;
	const float3 level2 = 3.0.xxx + z - s + (2.0*s - 4.0.xxx)/level3;
	const float3 level1 = 1.0.xxx + z - s + (s - 1.0.xxx)/level2;
	const float3 prefactor = pow(z, s) * exp(-z);
	return prefactor / level1;
}

float2 uigamma_large_z_impl(const float2 s, const float2 z)
{
	//  Two-component overload of the four-level continued fraction for the
	//  upper incomplete gamma function, evaluated innermost level first.
	const float2 level4 = 7.0.xx + z - s;
	const float2 level3 = 5.0.xx + z - s + (3.0*s - 9.0.xx)/level4;
	const float2 level2 = 3.0.xx + z - s + (2.0*s - 4.0.xx)/level3;
	const float2 level1 = 1.0.xx + z - s + (s - 1.0.xx)/level2;
	const float2 prefactor = pow(z, s) * exp(-z);
	return prefactor / level1;
}

float uigamma_large_z_impl(const float s, const float z)
{
	//  Scalar overload of the four-level continued fraction for the upper
	//  incomplete gamma function, evaluated innermost level first.
	const float level4 = 7.0 + z - s;
	const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
	const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
	const float level1 = 1.0 + z - s + (s - 1.0)/level2;
	const float prefactor = pow(z, s) * exp(-z);
	return prefactor / level1;
}

//  Normalized lower incomplete gamma function for small s (implementation):
float4 normalized_ligamma_impl(const float4 s, const float4 z,
    const float4 s_inv, const float4 gamma_s_inv)
{
    //  Requires:   1.) s < ~0.5
    //              2.) s_inv = 1/s (precomputed for outside reuse)
    //              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
    //  Returns:    Approximate the normalized lower incomplete gamma function
    //              for s < 0.5, independently for each component.  Only two
    //              branches are needed for s < 0.5, selected by comparing z
    //              against a threshold.  Each branch uses four terms; the
    //              original author quotes a max relative error of ~0.00182.
    //              The threshold and branch details were adapted for fewer
    //              terms from Gil/Segura/Temme's paper here:
    //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
	//  Evaluate both branches: the original author found real branching
	//  slower even where it is available.
	static const float4 thresh = 0.775075.xxxx;
	const bool4 z_is_large = z > thresh;
	const float4 large_z = 1.0.xxxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
	const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
	//  Blend with per-component 0/1 masks.  The comparison result must NOT be
	//  swizzled here: z_is_large.xxxx would copy component x's outcome into
	//  every lane, so y/z/w would choose their branch from x's z value.
	const float4 use_large = float4(z_is_large);
	const float4 use_small = float4(!z_is_large);
	return large_z * use_large + small_z * use_small;
}

float3 normalized_ligamma_impl(const float3 s, const float3 z,
    const float3 s_inv, const float3 gamma_s_inv)
{
    //  Float3 version: normalized lower incomplete gamma function for
    //  s < ~0.5, evaluated independently per component.  Requires
    //  s_inv = 1/s and gamma_s_inv = 1/gamma(s).
	static const float3 thresh = 0.775075.xxx;
	const bool3 z_is_large = z > thresh;
	//  Both branches are always evaluated and then blended:
	const float3 large_z = 1.0.xxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
	const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
	//  Blend with per-component 0/1 masks.  The comparison result must NOT be
	//  swizzled here: z_is_large.xxx would copy component x's outcome into
	//  every lane, so y/z would choose their branch from x's z value.
	const float3 use_large = float3(z_is_large);
	const float3 use_small = float3(!z_is_large);
	return large_z * use_large + small_z * use_small;
}

float2 normalized_ligamma_impl(const float2 s, const float2 z,
    const float2 s_inv, const float2 gamma_s_inv)
{
    //  Float2 version: normalized lower incomplete gamma function for
    //  s < ~0.5, evaluated independently per component.  Requires
    //  s_inv = 1/s and gamma_s_inv = 1/gamma(s).
	static const float2 thresh = 0.775075.xx;
	const bool2 z_is_large = z > thresh;
	//  Both branches are always evaluated and then blended:
	const float2 large_z = 1.0.xx - uigamma_large_z_impl(s, z) * gamma_s_inv;
	const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
	//  Blend with per-component 0/1 masks.  The comparison result must NOT be
	//  swizzled here: z_is_large.xx would copy component x's outcome into
	//  both lanes, so y would choose its branch from x's z value.
	const float2 use_large = float2(z_is_large);
	const float2 use_small = float2(!z_is_large);
	return large_z * use_large + small_z * use_small;
}

float normalized_ligamma_impl(const float s, const float z,
    const float s_inv, const float gamma_s_inv)
{
	//  Scalar overload: normalized lower incomplete gamma function for
	//  s < ~0.5.  Requires s_inv = 1/s and gamma_s_inv = 1/gamma(s).
	//  Both branches are always evaluated; a 0/1 weight picks the result.
	static const float z_threshold = 0.775075;
	const bool above_threshold = z > z_threshold;
	const float upper_tail = uigamma_large_z_impl(s, z) * gamma_s_inv;
	const float from_large_z = 1.0 - upper_tail;
	const float from_small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
	const float large_weight = float(above_threshold);
	const float small_weight = float(!above_threshold);
	return from_large_z * large_weight + from_small_z * small_weight;
}

//  Normalized lower incomplete gamma function for small s:
float4 normalized_ligamma(const float4 s, const float4 z)
{
	//  Requires:   s < ~0.5
	//  Returns:    The approximate normalized lower incomplete gamma function
	//              per component.  This wrapper derives 1/s and 1/gamma(s)
	//              and forwards them to normalized_ligamma_impl().
	const float4 reciprocal_s = 1.0.xxxx/s;
	const float4 reciprocal_gamma = 1.0.xxxx/gamma_impl(s, reciprocal_s);
	return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
}

float3 normalized_ligamma(const float3 s, const float3 z)
{
	//  Three-component overload: derives 1/s and 1/gamma(s), then forwards
	//  to normalized_ligamma_impl().
	const float3 reciprocal_s = 1.0.xxx/s;
	const float3 reciprocal_gamma = 1.0.xxx/gamma_impl(s, reciprocal_s);
	return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
}

float2 normalized_ligamma(const float2 s, const float2 z)
{
	//  Two-component overload: derives 1/s and 1/gamma(s), then forwards
	//  to normalized_ligamma_impl().
	const float2 reciprocal_s = 1.0.xx/s;
	const float2 reciprocal_gamma = 1.0.xx/gamma_impl(s, reciprocal_s);
	return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
}

float normalized_ligamma(const float s, const float z)
{
	//  Scalar overload: derives 1/s and 1/gamma(s), then forwards to
	//  normalized_ligamma_impl().
	const float reciprocal_s = 1.0/s;
	const float reciprocal_gamma = 1.0/gamma_impl(s, reciprocal_s);
	return normalized_ligamma_impl(s, z, reciprocal_s, reciprocal_gamma);
}


#endif  //  SPECIAL_FUNCTIONS_H
Add a new port of crt-royale.fx (#3260) - A new port of crt-royale. More faithful to original. It uses the same mask textures. - The only thing not ported is the original geometry pass. It was replaced by geom curvature code. - It's configured for 1080p displays. 4k displays need to adjust param mask_triad_size_desired from 3.0 to 4.0. OBS: It's up to you decide if the two versions should be maintained. 2024-07-26 04:33:01 +00:00			`#ifndef SPECIAL_FUNCTIONS_H`
			`#define SPECIAL_FUNCTIONS_H`

			`///////////////////////////////// MIT LICENSE ////////////////////////////////`

			`// Copyright (C) 2014 TroggleMonkey`
			`//`
			`// Permission is hereby granted, free of charge, to any person obtaining a copy`
			`// of this software and associated documentation files (the "Software"), to`
			`// deal in the Software without restriction, including without limitation the`
			`// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or`
			`// sell copies of the Software, and to permit persons to whom the Software is`
			`// furnished to do so, subject to the following conditions:`
			`//`
			`// The above copyright notice and this permission notice shall be included in`
			`// all copies or substantial portions of the Software.`
			`//`
			`// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING`
			`// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS`
			`// IN THE SOFTWARE.`


			`///////////////////////////////// DESCRIPTION ////////////////////////////////`

			`// This file implements the following mathematical special functions:`
			`// 1.) erf() = 2/sqrt(pi) * indefinite_integral(e**(-x**2))`
			`// 2.) gamma(s), a real-numbered extension of the integer factorial function`
			`// It also implements normalized_ligamma(s, z), a normalized lower incomplete`
			`// gamma function for s < 0.5 only. Both gamma() and normalized_ligamma() can`
			`// be called with an _impl suffix to use an implementation version with a few`
			`// extra precomputed parameters (which may be useful for the caller to reuse).`
			`// See below for details.`
			`//`
			`// Design Rationale:`
			`// Pretty much every line of code in this file is duplicated four times for`
			`// different input types (float4/float3/float2/float). This is unfortunate,`
			`// but Cg doesn't allow function templates. Macros would be far less verbose,`
			`// but they would make the code harder to document and read. I don't expect`
			`// these functions will require a whole lot of maintenance changes unless`
			`// someone ever has need for more robust incomplete gamma functions, so code`
			`// duplication seems to be the lesser evil in this case.`


			`/////////////////////////// GAUSSIAN ERROR FUNCTION //////////////////////////`

			`float4 erf6(float4 x)`
			`{`
			`// Requires: x is the standard parameter to erf().`
			`// Returns: Return an Abramowitz/Stegun approximation of erf(), where:`
			`// erf(x) = 2/sqrt(pi) * integral(e**(-x**2))`
			`// This approximation has a max absolute error of 2.5*10**-5`
			`// with solid numerical robustness and efficiency. See:`
			`// https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions`
			`static const float4 one = 1.0.xxxx;`
			`const float4 sign_x = sign(x);`
			`const float4 t = one/(one + 0.47047*abs(x));`
			`const float4 result = one - t(0.3480242 + t(-0.0958798 + t0.7478556))`
			`exp(-(x*x));`
			`return result * sign_x;`
			`}`

			`float3 erf6(const float3 x)`
			`{`
			`// Float3 version:`
			`static const float3 one = 1.0.xxx;`
			`const float3 sign_x = sign(x);`
			`const float3 t = one/(one + 0.47047*abs(x));`
			`const float3 result = one - t(0.3480242 + t(-0.0958798 + t0.7478556))`
			`exp(-(x*x));`
			`return result * sign_x;`
			`}`

			`float2 erf6(const float2 x)`
			`{`
			`// Float2 version:`
			`static const float2 one = 1.0.xx;`
			`const float2 sign_x = sign(x);`
			`const float2 t = one/(one + 0.47047*abs(x));`
			`const float2 result = one - t(0.3480242 + t(-0.0958798 + t0.7478556))`
			`exp(-(x*x));`
			`return result * sign_x;`
			`}`

			`float erf6(const float x)`
			`{`
			`// Float version:`
			`const float sign_x = sign(x);`
			`const float t = 1.0/(1.0 + 0.47047*abs(x));`
			`const float result = 1.0 - t(0.3480242 + t(-0.0958798 + t0.7478556))`
			`exp(-(x*x));`
			`return result * sign_x;`
			`}`

			`float4 erft(const float4 x)`
			`{`
			`// Requires: x is the standard parameter to erf().`
			`// Returns: Approximate erf() with the hyperbolic tangent. The error is`
			`// visually noticeable, but it's blazing fast and perceptually`
			`// close...at least on ATI hardware. See:`
			`// http://www.maplesoft.com/applications/view.aspx?SID=5525&view=html`
			`// Warning: Only use this if your hardware drivers correctly implement`
			`// tanh(): My nVidia 8800GTS returns garbage output.`
			`return tanh(1.202760580 * x);`
			`}`

			`float3 erft(const float3 x)`
			`{`
			`// Float3 version:`
			`return tanh(1.202760580 * x);`
			`}`

			`float2 erft(const float2 x)`
			`{`
			`// Float2 version:`
			`return tanh(1.202760580 * x);`
			`}`

			`float erft(const float x)`
			`{`
			`// Float version:`
			`return tanh(1.202760580 * x);`
			`}`

			`float4 erf(const float4 x)`
			`{`
			`// Requires: x is the standard parameter to erf().`
			`// Returns: Some approximation of erf(x), depending on user settings.`
			`#ifdef ERF_FAST_APPROXIMATION`
			`return erft(x);`
			`#else`
			`return erf6(x);`
			`#endif`
			`}`

			`float3 erf(const float3 x)`
			`{`
			`// Float3 version:`
			`#ifdef ERF_FAST_APPROXIMATION`
			`return erft(x);`
			`#else`
			`return erf6(x);`
			`#endif`
			`}`

			`float2 erf(const float2 x)`
			`{`
			`// Float2 version:`
			`#ifdef ERF_FAST_APPROXIMATION`
			`return erft(x);`
			`#else`
			`return erf6(x);`
			`#endif`
			`}`

			`float erf(const float x)`
			`{`
			`// Float version:`
			`#ifdef ERF_FAST_APPROXIMATION`
			`return erft(x);`
			`#else`
			`return erf6(x);`
			`#endif`
			`}`


			`/////////////////////////// COMPLETE GAMMA FUNCTION //////////////////////////`

			`float4 gamma_impl(const float4 s, const float4 s_inv)`
			`{`
			`// Requires: 1.) s is the standard parameter to the gamma function, and`
			`// it should lie in the [0, 36] range.`
			`// 2.) s_inv = 1.0/s. This implementation function requires`
			`// the caller to precompute this value, giving users the`
			`// opportunity to reuse it.`
			`// Returns: Return approximate gamma function (real-numbered factorial)`
			`// output using the Lanczos approximation with two coefficients`
			`// calculated using Paul Godfrey's method here:`
			`// http://my.fit.edu/~gabdo/gamma.txt`
			`// An optimal g value for s in [0, 36] is ~1.12906830989, with`
			`// a maximum relative error of 0.000463 for 2**16 equally`
			`// evals. We could use three coeffs (0.0000346 error) without`
			`// hurting latency, but this allows more parallelism with`
			`// outside instructions.`
			`static const float4 g = 1.12906830989.xxxx;`
			`static const float4 c0 = 0.8109119309638332633713423362694399653724431.xxxx;`
			`static const float4 c1 = 0.4808354605142681877121661197951496120000040.xxxx;`
			`static const float4 e = 2.71828182845904523536028747135266249775724709.xxxx;`
			`const float4 sph = s + 0.5.xxxx;`
			`const float4 lanczos_sum = c0 + c1/(s + 1.0.xxxx);`
			`const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e`
			`// gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).`
			`// This has less error for small s's than (s -= 1.0) at the beginning.`
			`return (pow(base, sph) * lanczos_sum) * s_inv;`
			`}`

			`float3 gamma_impl(const float3 s, const float3 s_inv)`
			`{`
			`// Float3 version:`
			`static const float3 g = 1.12906830989.xxx;`
			`static const float3 c0 = 0.8109119309638332633713423362694399653724431.xxx;`
			`static const float3 c1 = 0.4808354605142681877121661197951496120000040.xxx;`
			`static const float3 e = 2.71828182845904523536028747135266249775724709.xxx;`
			`const float3 sph = s + 0.5.xxx;`
			`const float3 lanczos_sum = c0 + c1/(s + 1.0.xxx);`
			`const float3 base = (sph + g)/e;`
			`return (pow(base, sph) * lanczos_sum) * s_inv;`
			`}`

			`float2 gamma_impl(const float2 s, const float2 s_inv)`
			`{`
			`// Float2 version:`
			`static const float2 g = 1.12906830989.xx;`
			`static const float2 c0 = 0.8109119309638332633713423362694399653724431.xx;`
			`static const float2 c1 = 0.4808354605142681877121661197951496120000040.xx;`
			`static const float2 e = 2.71828182845904523536028747135266249775724709.xx;`
			`const float2 sph = s + 0.5.xx;`
			`const float2 lanczos_sum = c0 + c1/(s + 1.0.xx);`
			`const float2 base = (sph + g)/e;`
			`return (pow(base, sph) * lanczos_sum) * s_inv;`
			`}`

			`float gamma_impl(const float s, const float s_inv)`
			`{`
			`// Float version:`
			`static const float g = 1.12906830989;`
			`static const float c0 = 0.8109119309638332633713423362694399653724431;`
			`static const float c1 = 0.4808354605142681877121661197951496120000040;`
			`static const float e = 2.71828182845904523536028747135266249775724709;`
			`const float sph = s + 0.5;`
			`const float lanczos_sum = c0 + c1/(s + 1.0);`
			`const float base = (sph + g)/e;`
			`return (pow(base, sph) * lanczos_sum) * s_inv;`
			`}`

			`float4 gamma(const float4 s)`
			`{`
			`// Requires: s is the standard parameter to the gamma function, and it`
			`// should lie in the [0, 36] range.`
			`// Returns: Return approximate gamma function output with a maximum`
			`// relative error of 0.000463. See gamma_impl for details.`
			`return gamma_impl(s, 1.0.xxxx/s);`
			`}`

			`float3 gamma(const float3 s)`
			`{`
			`// Float3 version:`
			`return gamma_impl(s, 1.0.xxx/s);`
			`}`

			`float2 gamma(const float2 s)`
			`{`
			`// Float2 version:`
			`return gamma_impl(s, 1.0.xx/s);`
			`}`

			`float gamma(const float s)`
			`{`
			`// Float version:`
			`return gamma_impl(s, 1.0/s);`
			`}`


			`//////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) ///////////////`

			`// Lower incomplete gamma function for small s and z (implementation):`
			`float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)`
			`{`
			`// Requires: 1.) s < ~0.5`
			`// 2.) z <= ~0.775075`
			`// 3.) s_inv = 1.0/s (precomputed for outside reuse)`
			`// Returns: A series representation for the lower incomplete gamma`
			`// function for small s and small z (4 terms).`
			`// The actual "rolled up" summation looks like:`
			`// last_sign = 1.0; last_pow = 1.0; last_factorial = 1.0;`
			`// sum = last_sign * last_pow / ((s + k) * last_factorial)`
			`// for(int i = 0; i < 4; ++i)`
			`// {`
			`// last_sign *= -1.0; last_pow *= z; last_factorial *= i;`
			`// sum += last_sign * last_pow / ((s + k) * last_factorial);`
			`// }`
			`// Unrolled, constant-unfolded and arranged for madds and parallelism:`
			`const float4 scale = pow(z, s);`
			`float4 sum = s_inv; // Summation iteration 0 result`
			`// Summation iterations 1, 2, and 3:`
			`const float4 z_sq = z*z;`
			`const float4 denom1 = s + 1.0.xxxx;`
			`const float4 denom2 = 2.0*s + 4.0.xxxx;`
			`const float4 denom3 = 6.0*s + 18.0.xxxx;`
			`//float4 denom4 = 24.0*s + float4(96.0);`
			`sum -= z/denom1;`
			`sum += z_sq/denom2;`
			`sum -= z * z_sq/denom3;`
			`//sum += z_sq * z_sq / denom4;`
			`// Scale and return:`
			`return scale * sum;`
			`}`

			`float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)`
			`{`
			`// Float3 version:`
			`const float3 scale = pow(z, s);`
			`float3 sum = s_inv;`
			`const float3 z_sq = z*z;`
			`const float3 denom1 = s + 1.0.xxx;`
			`const float3 denom2 = 2.0*s + 4.0.xxx;`
			`const float3 denom3 = 6.0*s + 18.0.xxx;`
			`sum -= z/denom1;`
			`sum += z_sq/denom2;`
			`sum -= z * z_sq/denom3;`
			`return scale * sum;`
			`}`

			`float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)`
			`{`
			`// Float2 version:`
			`const float2 scale = pow(z, s);`
			`float2 sum = s_inv;`
			`const float2 z_sq = z*z;`
			`const float2 denom1 = s + 1.0.xx;`
			`const float2 denom2 = 2.0*s + 4.0.xx;`
			`const float2 denom3 = 6.0*s + 18.0.xx;`
			`sum -= z/denom1;`
			`sum += z_sq/denom2;`
			`sum -= z * z_sq/denom3;`
			`return scale * sum;`
			`}`

			`float ligamma_small_z_impl(const float s, const float z, const float s_inv)`
			`{`
			`// Float version:`
			`const float scale = pow(z, s);`
			`float sum = s_inv;`
			`const float z_sq = z*z;`
			`const float denom1 = s + 1.0;`
			`const float denom2 = 2.0*s + 4.0;`
			`const float denom3 = 6.0*s + 18.0;`
			`sum -= z/denom1;`
			`sum += z_sq/denom2;`
			`sum -= z * z_sq/denom3;`
			`return scale * sum;`
			`}`

			//  Upper incomplete gamma function for small s and large z (implementation):
			float4 uigamma_large_z_impl(const float4 s, const float4 z)
			{
				//  Requires:   1.) s < ~0.5
				//              2.) z > ~0.775075
				//  Returns:    A 4-level truncation of Gauss's continued fraction for
				//              the upper incomplete gamma function.
				//  In loop form, the denominator is built from the innermost level
				//  outward, starting from an infinite denominator:
				//      denom = inf;
				//      for(int i = 4; i > 0; --i)
				//          denom = (2*i - 1) + z - s + i*(s - i)/denom;
				//  Below, each level is written out with i*(s - i) pre-expanded.
				const float4 level4 = 7.0.xxxx + z - s;
				const float4 level3 = 5.0.xxxx + z - s + (3.0*s - 9.0.xxxx)/level4;
				const float4 level2 = 3.0.xxxx + z - s + (2.0*s - 4.0.xxxx)/level3;
				const float4 level1 = 1.0.xxxx + z - s + (s - 1.0.xxxx)/level2;
				//  Prefactor z**s * e**(-z) sits on top of the finished fraction:
				const float4 prefactor = pow(z, s) * exp(-z);
				return prefactor / level1;
			}

			float3 uigamma_large_z_impl(const float3 s, const float3 z)
			{
				//  Float3 version:
				//  Four nested continued-fraction levels, evaluated from the innermost
				//  (level4) outward, then divided into the prefactor z**s * e**(-z).
				const float3 level4 = 7.0.xxx + z - s;
				const float3 level3 = 5.0.xxx + z - s + (3.0*s - 9.0.xxx)/level4;
				const float3 level2 = 3.0.xxx + z - s + (2.0*s - 4.0.xxx)/level3;
				const float3 level1 = 1.0.xxx + z - s + (s - 1.0.xxx)/level2;
				const float3 prefactor = pow(z, s) * exp(-z);
				return prefactor / level1;
			}

			float2 uigamma_large_z_impl(const float2 s, const float2 z)
			{
				//  Float2 version:
				//  Four nested continued-fraction levels, evaluated from the innermost
				//  (level4) outward, then divided into the prefactor z**s * e**(-z).
				const float2 level4 = 7.0.xx + z - s;
				const float2 level3 = 5.0.xx + z - s + (3.0*s - 9.0.xx)/level4;
				const float2 level2 = 3.0.xx + z - s + (2.0*s - 4.0.xx)/level3;
				const float2 level1 = 1.0.xx + z - s + (s - 1.0.xx)/level2;
				const float2 prefactor = pow(z, s) * exp(-z);
				return prefactor / level1;
			}

			float uigamma_large_z_impl(const float s, const float z)
			{
				//  Float version:
				//  Four nested continued-fraction levels, evaluated from the innermost
				//  (level4) outward, then divided into the prefactor z**s * e**(-z).
				const float level4 = 7.0 + z - s;
				const float level3 = 5.0 + z - s + (3.0*s - 9.0)/level4;
				const float level2 = 3.0 + z - s + (2.0*s - 4.0)/level3;
				const float level1 = 1.0 + z - s + (s - 1.0)/level2;
				const float prefactor = pow(z, s) * exp(-z);
				return prefactor / level1;
			}

			//  Normalized lower incomplete gamma function for small s (implementation):
			float4 normalized_ligamma_impl(const float4 s, const float4 z,
				const float4 s_inv, const float4 gamma_s_inv)
			{
				//  Requires:   1.) s < ~0.5
				//              2.) s_inv = 1/s (precomputed for outside reuse)
				//              3.) gamma_s_inv = 1/gamma(s) (precomputed for outside reuse)
				//  Returns:    Approximate the normalized lower incomplete gamma function
				//              for s < 0.5.  Since we only care about s < 0.5, we only need
				//              to evaluate two branches (not four) based on z.  Each branch
				//              uses four terms, with a max relative error of ~0.00182.  The
				//              branch threshold and specifics were adapted for fewer terms
				//              from Gil/Segura/Temme's paper here:
				//              http://oai.cwi.nl/oai/asset/20433/20433B.pdf
				//  Evaluate both branches: Real branches test slower even when available.
				static const float4 thresh = 0.775075.xxxx;
				const bool4 z_is_large = z > thresh;
				const float4 large_z = 1.0.xxxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
				const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
				//  Combine the results from both branches, one component at a time.
				//  The mask must be cast whole: swizzling it with .xxxx would copy the
				//  x component's comparison into every lane, so y/z/w would pick their
				//  branch based on z.x instead of their own z.
				return large_z * float4(z_is_large) + small_z * float4(!z_is_large);
			}

			float3 normalized_ligamma_impl(const float3 s, const float3 z,
				const float3 s_inv, const float3 gamma_s_inv)
			{
				//  Float3 version:
				//  Requires:   s_inv and gamma_s_inv are passed in by the caller
				//              (normalized_ligamma() passes 1/s and 1/gamma_impl(s, 1/s)).
				//  Returns:    Per component, 1 - uigamma_large_z_impl()*gamma_s_inv where
				//              z exceeds the threshold, otherwise
				//              ligamma_small_z_impl()*gamma_s_inv.
				//  Both branches are always evaluated and then blended with 0/1 weights.
				static const float3 thresh = 0.775075.xxx;
				const bool3 z_is_large = z > thresh;
				const float3 large_z = 1.0.xxx - uigamma_large_z_impl(s, z) * gamma_s_inv;
				const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
				//  Cast the whole mask: swizzling it with .xxx would copy the x
				//  component's comparison into every lane, so y/z would pick their
				//  branch based on z.x instead of their own z.
				return large_z * float3(z_is_large) + small_z * float3(!z_is_large);
			}

			float2 normalized_ligamma_impl(const float2 s, const float2 z,
				const float2 s_inv, const float2 gamma_s_inv)
			{
				//  Float2 version:
				//  Requires:   s_inv and gamma_s_inv are passed in by the caller
				//              (normalized_ligamma() passes 1/s and 1/gamma_impl(s, 1/s)).
				//  Returns:    Per component, 1 - uigamma_large_z_impl()*gamma_s_inv where
				//              z exceeds the threshold, otherwise
				//              ligamma_small_z_impl()*gamma_s_inv.
				//  Both branches are always evaluated and then blended with 0/1 weights.
				static const float2 thresh = 0.775075.xx;
				const bool2 z_is_large = z > thresh;
				const float2 large_z = 1.0.xx - uigamma_large_z_impl(s, z) * gamma_s_inv;
				const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
				//  Cast the whole mask: swizzling it with .xx would copy the x
				//  component's comparison into both lanes, so y would pick its branch
				//  based on z.x instead of z.y.
				return large_z * float2(z_is_large) + small_z * float2(!z_is_large);
			}

			float normalized_ligamma_impl(const float s, const float z,
				const float s_inv, const float gamma_s_inv)
			{
				//  Float version:
				//  Both candidate results are always computed; the comparison against
				//  the cutoff only decides which one receives weight 1 and which 0.
				static const float branch_cutoff = 0.775075;
				const float from_series = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
				const float from_fraction = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
				const bool above_cutoff = z > branch_cutoff;
				const float weight_fraction = float(above_cutoff);
				const float weight_series = float(!above_cutoff);
				return from_fraction * weight_fraction + from_series * weight_series;
			}

			//  Normalized lower incomplete gamma function for small s:
			float4 normalized_ligamma(const float4 s, const float4 z)
			{
				//  Requires:   s < ~0.5
				//  Returns:    The result of normalized_ligamma_impl(), called with
				//              1/s and 1/gamma_impl(s, 1/s) computed here.
				const float4 one = 1.0.xxxx;
				const float4 inv_s = one/s;
				const float4 inv_gamma_s = one/gamma_impl(s, inv_s);
				return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
			}

			float3 normalized_ligamma(const float3 s, const float3 z)
			{
				//  Float3 version:
				//  Computes 1/s and 1/gamma_impl(s, 1/s), then defers to
				//  normalized_ligamma_impl().
				const float3 one = 1.0.xxx;
				const float3 inv_s = one/s;
				const float3 inv_gamma_s = one/gamma_impl(s, inv_s);
				return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
			}

			float2 normalized_ligamma(const float2 s, const float2 z)
			{
				//  Float2 version:
				//  Computes 1/s and 1/gamma_impl(s, 1/s), then defers to
				//  normalized_ligamma_impl().
				const float2 one = 1.0.xx;
				const float2 inv_s = one/s;
				const float2 inv_gamma_s = one/gamma_impl(s, inv_s);
				return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
			}

			float normalized_ligamma(const float s, const float z)
			{
				//  Float version:
				//  Computes 1/s and 1/gamma_impl(s, 1/s), then defers to
				//  normalized_ligamma_impl().
				const float inv_s = 1.0/s;
				const float inv_gamma_s = 1.0/gamma_impl(s, inv_s);
				return normalized_ligamma_impl(s, z, inv_s, inv_gamma_s);
			}


			`#endif // SPECIAL_FUNCTIONS_H`