ryujin 2.1.1 revision 6759a3f00bf045f3527c5e7e7dfd18c7d96a6edb
simd_fast_pow.template.h
Go to the documentation of this file.
1//
2// SPDX-License-Identifier: Apache-2.0
3// Copyright (C) 2014 - 2022 by Agner Fog
4// Copyright (C) 2023 by the ryujin authors
5//
6
7#pragma once
8
9#include "simd.h"
10
11#include <cmath>
12
13#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
14#define VCL_NAMESPACE vcl
15DEAL_II_DISABLE_EXTRA_DIAGNOSTICS
16#include "../simd-math/vectorclass.h"
17#include "../simd-math/vectormath_exp.h"
18DEAL_II_ENABLE_EXTRA_DIAGNOSTICS
19#undef VCL_NAMESPACE
20
21#else
22namespace ryujin
23{
24 template <typename T>
25 T fast_pow_impl(const T x, const T b, const Bias)
26 {
27 return std::pow(x, b);
28 }
29} // namespace ryujin
30#endif
31
32
33#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)
34namespace ryujin
35{
36 template <typename VTYPE>
37 inline DEAL_II_ALWAYS_INLINE VTYPE fast_pow_impl(VTYPE const x0,
38 VTYPE const y,
39 Bias)
40 {
41 /* clang-format off */
42 using namespace vcl;
43
44 const float ln2f_hi = 0.693359375f; // log(2), split in two for extended precision
45 const auto log2e = static_cast<float>(VM_LOG2E); // 1/log(2)
46
47 const float P0logf = 3.3333331174E-1f; // coefficients for logarithm expansion
48 const float P1logf = -2.4999993993E-1f;
49 const float P2logf = 2.0000714765E-1f;
50 const float P3logf = -1.6668057665E-1f;
51 const float P4logf = 1.4249322787E-1f;
52 const float P5logf = -1.2420140846E-1f;
53 const float P6logf = 1.1676998740E-1f;
54 const float P7logf = -1.1514610310E-1f;
55 const float P8logf = 7.0376836292E-2f;
56
57 const float p2expf = 1.f/2.f; // coefficients for Taylor expansion of exp
58 const float p3expf = 1.f/6.f;
59 const float p4expf = 1.f/24.f;
60 const float p5expf = 1.f/120.f;
61 const float p6expf = 1.f/720.f;
62 const float p7expf = 1.f/5040.f;
63
64 typedef decltype(roundi(x0)) ITYPE; // integer vector type
65 typedef decltype(x0 < x0) BVTYPE; // boolean vector type
66
67 // data vectors
68 VTYPE x, x1, x2; // x variable
69 VTYPE ef, e1, e2, e3, ee; // exponent
70 VTYPE yr; // remainder
71 VTYPE lg, lg1, lgerr, x2err, v; // logarithm
72 VTYPE z; // pow(x,y)
73 VTYPE yodd(0); // has sign bit set if y is an odd integer
74 // integer vectors
75 ITYPE ei, ej; // exponent
76 // boolean vectors
77 BVTYPE blend, xzero; // x conditions
78 BVTYPE overflow, underflow; // error conditions
79
80 // remove sign
81 x1 = abs(x0);
82
83 // Separate mantissa from exponent
84 // This gives the mantissa * 0.5
85 x = fraction_2(x1);
86
87 // reduce range of x = +/- sqrt(2)/2
88 blend = x > static_cast<float>(VM_SQRT2 * 0.5);
89 x = if_add(!blend, x, x); // conditional add
90
91 // Taylor expansion, high precision
92 x -= 1.0f;
93 x2 = x * x;
94 lg1 = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
95 lg1 *= x2 * x;
96
97 // extract exponent
98 ef = exponent_f(x1);
99 ef = if_add(blend, ef, 1.0f); // conditional add
100
101 // multiply exponent by y
102 // nearest integer e1 goes into exponent of result, remainder yr is added to log
103 e1 = round(ef * y);
104 yr = mul_sub(ef, y, e1); // calculate remainder yr. precision very important here
105
106 // add initial terms to expansion
107 lg = nmul_add(0.5f, x2, x) + lg1; // lg = (x - 0.5f * x2) + lg1;
108
109 // calculate rounding errors in lg
110 // rounding error in multiplication 0.5*x*x
111 x2err = mul_sub(0.5f*x, x, 0.5f * x2);
112 // rounding error in additions and subtractions
113 lgerr = mul_add(0.5f, x2, lg - x) - lg1; // lgerr = ((lg - x) + 0.5f * x2) - lg1;
114
115 // extract something for the exponent
116 e2 = round(lg * y * static_cast<float>(VM_LOG2E));
117 // subtract this from lg, with extra precision
118 v = mul_sub(lg, y, e2 * ln2f_hi);
119
120 // correct for previous rounding errors
121 v -= mul_sub(lgerr + x2err, y, yr * static_cast<float>(VM_LN2)); // v -= (lgerr + x2err) * y - yr * float(VM_LN2) ;
122
123 // exp function
124
125 // extract something for the exponent if possible
126 x = v;
127 e3 = round(x*log2e);
128 // high precision multiplication not needed here because abs(e3) <= 1
129 x = nmul_add(e3, static_cast<float>(VM_LN2), x); // x -= e3 * float(VM_LN2);
130
131 // Taylor polynomial
132 x2 = x * x;
133 z = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf)*x2 + x + 1.0f;
134
135 // contributions to exponent
136 ee = e1 + e2 + e3;
137 ei = roundi(ee);
138 // biased exponent of result:
139 ej = ei + (ITYPE(reinterpret_i(abs(z))) >> 23);
140
141 // add exponent by integer addition
142 z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23)); // the extra 0x10000 is shifted out here
143
144
145 // check for overflow and underflow
146
147 overflow = BVTYPE(ej >= 0x0FF) | (ee > 300.f);
148 underflow = BVTYPE(ej <= 0x000) | (ee < -300.f);
149 if (horizontal_or(overflow | underflow)) {
150 // handle errors
151 z = select(underflow, VTYPE(0.f), z);
152 z = select(overflow, infinite_vec<VTYPE>(), z);
153 }
154
155 // check for x == 0
156
157 xzero = is_zero_or_subnormal(x0);
158 z = wm_pow_case_x0(xzero, y, z);
159
160 return z;
161
162 /* clang-format on */
163 }
164} // namespace ryujin
165
166#endif
T pow(const T x, const T b)
T fast_pow_impl(const T x, const T b, const Bias)
Bias
Definition: simd.h:178