13#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
14#define VCL_NAMESPACE vcl
15DEAL_II_DISABLE_EXTRA_DIAGNOSTICS
16#include "../simd-math/vectorclass.h"
17#include "../simd-math/vectormath_exp.h"
18DEAL_II_ENABLE_EXTRA_DIAGNOSTICS
33#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
// Fragment of the vectorized single-precision power kernel `fast_pow_impl`.
// NOTE(review): this listing has gaps — the leading integers look like
// original line numbers left over from an extraction, and several locals used
// below (x, y, z, x2, yr, ei, ej, blend, xzero) are declared on lines that are
// not visible here. Confirm every comment against the complete file.
//
// Template parameter VTYPE: the SIMD float vector type; the integer and mask
// types used below are derived from it via decltype.
// Visible parameter x0: the input that is later tested for zero/subnormal.
36 template <
typename VTYPE>
37 inline DEAL_II_ALWAYS_INLINE VTYPE
fast_pow_impl(VTYPE
const x0,
// Value close to ln(2); multiplied by e2 and subtracted from lg*y further
// down. Presumably the high part of a split ln(2) constant — TODO confirm.
44 const float ln2f_hi = 0.693359375f;
// VM_LOG2E narrowed to float. NOTE(review): the visible statements below
// re-cast VM_LOG2E directly instead of using this variable — check whether
// it is used on a line not shown here.
45 const auto log2e =
static_cast<float>(VM_LOG2E);
// Coefficients handed to polynomial_8() to produce lg1 (the logarithm part).
47 const float P0logf = 3.3333331174E-1f;
48 const float P1logf = -2.4999993993E-1f;
49 const float P2logf = 2.0000714765E-1f;
50 const float P3logf = -1.6668057665E-1f;
51 const float P4logf = 1.4249322787E-1f;
52 const float P5logf = -1.2420140846E-1f;
53 const float P6logf = 1.1676998740E-1f;
54 const float P7logf = -1.1514610310E-1f;
55 const float P8logf = 7.0376836292E-2f;
// Coefficients 1/2!, 1/3!, ..., 1/7! handed to polynomial_5() for the
// exponential part.
57 const float p2expf = 1.f/2.f;
58 const float p3expf = 1.f/6.f;
59 const float p4expf = 1.f/24.f;
60 const float p5expf = 1.f/120.f;
61 const float p6expf = 1.f/720.f;
62 const float p7expf = 1.f/5040.f;
// ITYPE: whatever roundi() returns for a VTYPE argument (the matching integer
// vector). BVTYPE: the result type of comparing two VTYPE values (the mask).
64 typedef decltype(roundi(x0)) ITYPE;
65 typedef decltype(x0 < x0) BVTYPE;
69 VTYPE ef, e1, e2, e3, ee;
71 VTYPE lg, lg1, lgerr, x2err, v;
78 BVTYPE overflow, underflow;
// blend marks the lanes where x exceeds VM_SQRT2 * 0.5.
88 blend = x >
static_cast<float>(VM_SQRT2 * 0.5);
// Presumably if_add(mask, a, b) yields a + b where mask is set and a
// otherwise, i.e. x is doubled in the lanes where blend is false — verify
// against the vector class library.
89 x = if_add(!blend, x, x);
// Polynomial in x with the P*logf coefficients.
94 lg1 = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
// Counterpart of the conditional doubling above: ef gets 1.0f added in the
// blend lanes (same if_add assumption as above).
99 ef = if_add(blend, ef, 1.0f);
// NOTE(review): mul_sub(a, b, c) is assumed to be a*b - c, so yr = ef*y - e1;
// e1 is assigned on a line not visible here.
104 yr = mul_sub(ef, y, e1);
// NOTE(review): nmul_add(a, b, c) is assumed to be c - a*b, giving
// lg = x - 0.5*x2 + lg1.
107 lg = nmul_add(0.5f, x2, x) + lg1;
// The next two lines compute correction terms (x2err, lgerr) that are folded
// into v below, scaled by y.
111 x2err = mul_sub(0.5f*x, x, 0.5f * x2);
113 lgerr = mul_add(0.5f, x2, lg - x) - lg1;
// e2 is the rounded value of lg * y * VM_LOG2E.
116 e2 = round(lg * y *
static_cast<float>(VM_LOG2E));
// v = lg*y - e2*ln2f_hi under the mul_sub assumption noted above.
118 v = mul_sub(lg, y, e2 * ln2f_hi);
// Subtract the accumulated error terms, (lgerr + x2err)*y - yr*VM_LN2.
121 v -= mul_sub(lgerr + x2err, y, yr *
static_cast<float>(VM_LN2));
// Remove e3 * VM_LN2 from x (e3 is assigned on a line not visible here).
129 x = nmul_add(e3,
static_cast<float>(VM_LN2), x);
// Polynomial in x with the 1/n! coefficients, times x2, plus the x and 1
// terms.
133 z = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf)*x2 + x + 1.0f;
// ej = ei plus the bit pattern of |z| shifted right by 23; presumably this
// extracts the exponent field of the float — TODO confirm.
139 ej = ei + (ITYPE(reinterpret_i(abs(z))) >> 23);
// Add ei << 23 directly to the bit pattern of z; presumably this scales z by
// a power of two through its exponent field — TODO confirm.
142 z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23));
// Flag lanes whose combined exponent ej leaves the range (0x000, 0x0FF) or
// whose ee exceeds +/-300.
147 overflow = BVTYPE(ej >= 0x0FF) | (ee > 300.f);
148 underflow = BVTYPE(ej <= 0x000) | (ee < -300.f);
// Fix-up only runs when a flag is set; horizontal_or presumably reduces the
// mask over all lanes. The closing brace of this `if` is not visible here.
149 if (horizontal_or(overflow | underflow)) {
// Underflowing lanes become 0, overflowing lanes become infinite_vec<VTYPE>().
151 z = select(underflow, VTYPE(0.f), z);
152 z = select(overflow, infinite_vec<VTYPE>(), z);
// Lanes whose original input x0 is zero or subnormal are post-processed by
// wm_pow_case_x0(); its exact result depends on a helper defined elsewhere.
157 xzero = is_zero_or_subnormal(x0);
158 z = wm_pow_case_x0(xzero, y, z);
T pow(const T x, const T b)
T fast_pow_impl(const T x, const T b, const Bias)