The best approach for an erfcx()
implementation I have found so far is based on the following paper:
M. M. Shepherd and J. G. Laframboise, "Chebyshev Approximation of (1 + 2x) exp(x²) erfc x in 0 ≤ x < ∞." Mathematics of Computation, Volume 36, No. 153, January 1981, pp. 249-253 (online)
The paper proposes clever transformations that map the scaled complementary error function to a tightly bounded helper function that is amenable to straightforward polynomial approximation. I have experimented with variations of the transformations for the sake of performance, but all of these have had negative impact on accuracy. The choice of the constant K in the transformation (x - K) / (x + K) has a non-obvious relationship with the accuracy of the core approximation. I empirically determined "optimal" values, which differ from the paper.
The transformations for the arguments to the core approximation and of the intermediate results back into erfcx
results incur additional rounding errors. To mitigate their impact on accuracy, we need to apply compensation steps, which I outlined in some detail in my earlier question & answer regarding erfcf
. The availability of FMA greatly simplifies this task.
The resulting single-precision code looks as follows:
/*
 * Based on: M. M. Shepherd and J. G. Laframboise, "Chebyshev Approximation of
 * (1+2x)exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 36,
 * No. 153, January 1981, pp. 249-253.
 *
 */
/*
 * Compute the scaled complementary error function erfcx(x) = exp(x*x)*erfc(x)
 * in single precision. Positive arguments are handled via a polynomial
 * approximation in the transformed variable q = (a-2)/(a+2); negative
 * arguments use the reflection erfcx(x) = 2*exp(x*x) - erfcx(|x|).
 * Rounding errors introduced by the argument transformation and the final
 * division are removed with FMA-based compensation steps.
 */
float my_erfcxf (float x)
{
float a, d, e, m, p, q, r, s, t;
a = fmaxf (x, 0.0f - x); // NaN-preserving absolute value computation
/* Compute q = (a-2)/(a+2) accurately. [0,INF) -> [-1,1] */
m = a - 2.0f;
p = a + 2.0f;
#if FAST_RCP_SSE
r = fast_recipf_sse (p);
#else
r = 1.0f / p;
#endif
q = m * r;
/* Compensation: recover the residual of the division and fold it back in.
   t = a - 2*(q+1) = (a-2) - 2*q, computed exactly-enough via FMA;
   e = t - q*a    = (a-2) - q*(a+2), the error of q as an approximation
   to (a-2)/(a+2); the correction term is e/(a+2) ~= e*r. */
t = fmaf (q + 1.0f, -2.0f, a);
e = fmaf (q, -a, t);
q = fmaf (r, e, q);
/* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1,1] */
p = 0x1.f10000p-15f; // 5.92470169e-5
p = fmaf (p, q, 0x1.521cc6p-13f); // 1.61224554e-4
p = fmaf (p, q, -0x1.6b4ffep-12f); // -3.46481771e-4
p = fmaf (p, q, -0x1.6e2a7cp-10f); // -1.39681227e-3
p = fmaf (p, q, 0x1.3c1d7ep-10f); // 1.20588380e-3
p = fmaf (p, q, 0x1.1cc236p-07f); // 8.69014394e-3
p = fmaf (p, q, -0x1.069940p-07f); // -8.01387429e-3
p = fmaf (p, q, -0x1.bc1b6cp-05f); // -5.42122945e-2
p = fmaf (p, q, 0x1.4ff8acp-03f); // 1.64048523e-1
p = fmaf (p, q, -0x1.54081ap-03f); // -1.66031078e-1
p = fmaf (p, q, -0x1.7bf5cep-04f); // -9.27637145e-2
p = fmaf (p, q, 0x1.1ba03ap-02f); // 2.76978403e-1
/* Divide (1+p) by (1+2*a) ==> exp(a*a)*erfc(a) */
d = a + 0.5f;
#if FAST_RCP_SSE
r = fast_recipf_sse (d);
#else
r = 1.0f / d;
#endif
r = r * 0.5f; // r = 0.5/(a+0.5) = 1/(1+2*a)
q = fmaf (p, r, r); // q = (p+1)/(1+2*a)
t = q + q;
e = (p - q) + fmaf (t, -a, 1.0f); // residual: (p+1)-q*(1+2*a)
r = fmaf (e, r, q); // corrected quotient: q + residual/(1+2*a)
if (a > 0x1.fffffep127f) r = 0.0f; // 3.40282347e+38 // handle INF argument
/* Handle negative arguments: erfcx(x) = 2*exp(x*x) - erfcx(|x|) */
if (x < 0.0f) {
s = x * x; // high part of x*x
d = fmaf (x, x, -s); // low part: d = x*x - s, exact via FMA
e = expf (s);
r = e - r; // exp(s) - erfcx(|x|)
r = fmaf (e, d + d, r); // add 2*exp(s)*d, since exp(s+d) ~= exp(s)*(1+d)
r = r + e; // total: 2*exp(x*x) - erfcx(|x|)
if (e > 0x1.fffffep127f) r = e; // 3.40282347e+38 // avoid creating NaN
}
return r;
}
The maximum error of this implementation in the negative half-plane will depend on the accuracy of the standard math library's implementation of expf()
. Using the Intel compiler, version 13.1.3.198, and compiling with /fp:strict
I observed a maximum error of 2.00450 ulps in the positive half-plane and 2.38412 ulps in the negative half-plane in an exhaustive test. Best I can tell at this time, a faithfully-rounded implementation of expf()
will result in a maximum error of less than 2.5 ulps.
Note that while the code contains two divisions, which are potentially slow operations, they occur in the special form of reciprocals, and are thus amenable to the use of fast reciprocal approximations on many platforms. As long as the reciprocal approximation is faithfully rounded, the impact on erfcxf()
accuracy seems to be negligible, based on experiments. Even slightly larger errors, such as in the fast SSE version (with maximum error < 2.0 ulps) appear to have only a minor impact.
/* Fast reciprocal approximation. HW approximation plus Newton iteration */
float fast_recipf_sse (float a)
{
__m128 t;
float e, r;
t = _mm_set_ss (a);
t = _mm_rcp_ss (t);
_mm_store_ss (&r, t);
e = fmaf (0.0f - a, r, 1.0f);
r = fmaf (e, r, r);
return r;
}
Double-precision version erfcx()
is structurally identical to the single-precision version erfcxf()
, but requires a minimax polynomial approximation with many more terms. This presents a challenge when optimizing the core approximation, as many heuristics will break down when the search space is very large. The coefficients below represent my best solution to date, and there is definitely room for improvement. Building with the Intel compiler and /fp:strict
, and using 2^32 random test vectors, the maximum error observed was 2.82940 ulps in the positive half-plane and 2.79027 ulps in the negative half-plane.
/*
 * Compute the scaled complementary error function erfcx(x) = exp(x*x)*erfc(x)
 * in double precision. Structure mirrors the single-precision version:
 * positive arguments map through q = (a-4)/(a+4) into [-1,1] where a
 * minimax polynomial approximates (1+2*a)*exp(a*a)*erfc(a) - 1; negative
 * arguments use the reflection erfcx(x) = 2*exp(x*x) - erfcx(|x|).
 */
double my_erfcx (double x)
{
double a, d, e, m, p, q, r, s, t;
a = fmax (x, 0.0 - x); // NaN preserving absolute value computation
/* Compute q = (a-4)/(a+4) accurately. [0,INF) -> [-1,1] */
m = a - 4.0;
p = a + 4.0;
r = 1.0 / p;
q = m * r;
/* Compensation: e = (a-4) - q*(a+4) is the residual of the division;
   fold the correction e/(a+4) ~= e*r back into q. */
t = fma (q + 1.0, -4.0, a);
e = fma (q, -a, t);
q = fma (r, e, q);
/* Approximate (1+2*a)*exp(a*a)*erfc(a) as p(q)+1 for q in [-1,1].
   The polynomial is evaluated as two interleaved Horner chains (p and t)
   in s = q*q -- an even/odd split that exposes instruction-level
   parallelism; the chains are merged at the fma (p, q, t) step below. */
s = q * q;
p = 0x1.edcad78fc8044p-31; // 8.9820305531190140e-10
t = 0x1.b1548f14735d1p-30; // 1.5764464777959401e-09
p = fma (p, s, -0x1.a1ad2e6c4a7a8p-27); // -1.2155985739342269e-08
t = fma (t, s, -0x1.1985b48f08574p-26); // -1.6386753783877791e-08
p = fma (p, s, 0x1.c6a8093ac4f83p-24); // 1.0585794011876720e-07
t = fma (t, s, 0x1.31c2b2b44b731p-24); // 7.1190423171700940e-08
p = fma (p, s, -0x1.b87373facb29fp-21); // -8.2040389712752056e-07
t = fma (t, s, 0x1.3fef1358803b7p-22); // 2.9796165315625938e-07
p = fma (p, s, 0x1.7eec072bb0be3p-18); // 5.7059822144459833e-06
t = fma (t, s, -0x1.78a680a741c4ap-17); // -1.1225056665965572e-05
p = fma (p, s, -0x1.9951f39295cf4p-16); // -2.4397380523258482e-05
t = fma (t, s, 0x1.3be1255ce180bp-13); // 1.5062307184282616e-04
p = fma (p, s, -0x1.a1df71176b791p-13); // -1.9925728768782324e-04
t = fma (t, s, -0x1.8d4aaa0099bc8p-11); // -7.5777369791018515e-04
p = fma (p, s, 0x1.49c673066c831p-8); // 5.0319701025945277e-03
t = fma (t, s, -0x1.0962386ea02b7p-6); // -1.6197733983519948e-02
p = fma (p, s, 0x1.3079edf465cc3p-5); // 3.7167515521269866e-02
t = fma (t, s, -0x1.0fb06dfedc4ccp-4); // -6.6330365820039094e-02
p = fma (p, s, 0x1.7fee004e266dfp-4); // 9.3732834999538536e-02
t = fma (t, s, -0x1.9ddb23c3e14d2p-4); // -1.0103906603588378e-01
p = fma (p, s, 0x1.16ecefcfa4865p-4); // 6.8097054254651804e-02
t = fma (t, s, 0x1.f7f5df66fc349p-7); // 1.5379652102610957e-02
p = fma (p, q, t); // merge even (p) and odd (t) chains
p = fma (p, q, -0x1.1df1ad154a27fp-3); // -1.3962111684056208e-01
p = fma (p, q, 0x1.dd2c8b74febf6p-3); // 2.3299511862555250e-01
/* Divide (1+p) by (1+2*a) ==> exp(a*a)*erfc(a) */
d = a + 0.5;
r = 1.0 / d;
r = r * 0.5; // r = 0.5/(a+0.5) = 1/(1+2*a)
q = fma (p, r, r); // q = (p+1)/(1+2*a)
t = q + q;
e = (p - q) + fma (t, -a, 1.0); // residual: (p+1)-q*(1+2*a)
r = fma (e, r, q); // corrected quotient: q + residual/(1+2*a)
/* Handle argument of infinity */
if (a > 0x1.fffffffffffffp1023) r = 0.0;
/* Handle negative arguments: erfcx(x) = 2*exp(x*x) - erfcx(|x|) */
if (x < 0.0) {
s = x * x; // high part of x*x
d = fma (x, x, -s); // low part: d = x*x - s, exact via FMA
e = exp (s);
r = e - r; // exp(s) - erfcx(|x|)
r = fma (e, d + d, r); // add 2*exp(s)*d, since exp(s+d) ~= exp(s)*(1+d)
r = r + e; // total: 2*exp(x*x) - erfcx(|x|)
if (e > 0x1.fffffffffffffp1023) r = e; // avoid creating NaN
}
return r;
}