Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_exp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /* SIMD (SSE4) implementation of exp
11  Inspired by Intel Approximate Math library, and based on the
12  corresponding algorithms of the cephes math library
13 */
14 
15 /* Copyright (C) 2007 Julien Pommier
16 
17  This software is provided 'as-is', without any express or implied
18  warranty. In no event will the authors be held liable for any damages
19  arising from the use of this software.
20 
21  Permission is granted to anyone to use this software for any purpose,
22  including commercial applications, and to alter it and redistribute it
23  freely, subject to the following restrictions:
24 
25  1. The origin of this software must not be misrepresented; you must not
26  claim that you wrote the original software. If you use this software
27  in a product, an acknowledgment in the product documentation would be
28  appreciated but is not required.
29  2. Altered source versions must be plainly marked as such, and must not be
30  misrepresented as being the original software.
31  3. This notice may not be removed or altered from any source distribution.
32 
33  (this is the zlib license)
34 */
35 
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 
86 #ifndef INCLUDED_volk_32f_exp_32f_a_H
87 #define INCLUDED_volk_32f_exp_32f_a_H
88 
89 #ifdef LV_HAVE_SSE2
90 #include <emmintrin.h>
91 
92 static inline void
93 volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
94 {
95  float* bPtr = bVector;
96  const float* aPtr = aVector;
97 
98  unsigned int number = 0;
99  unsigned int quarterPoints = num_points / 4;
100 
101  // Declare variables and constants
102  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105  __m128i emm0, pi32_0x7f;
106 
107  one = _mm_set1_ps(1.0);
108  exp_hi = _mm_set1_ps(88.3762626647949);
109  exp_lo = _mm_set1_ps(-88.3762626647949);
110  log2EF = _mm_set1_ps(1.44269504088896341);
111  half = _mm_set1_ps(0.5);
112  exp_C1 = _mm_set1_ps(0.693359375);
113  exp_C2 = _mm_set1_ps(-2.12194440e-4);
114  pi32_0x7f = _mm_set1_epi32(0x7f);
115 
116  exp_p0 = _mm_set1_ps(1.9875691500e-4);
117  exp_p1 = _mm_set1_ps(1.3981999507e-3);
118  exp_p2 = _mm_set1_ps(8.3334519073e-3);
119  exp_p3 = _mm_set1_ps(4.1665795894e-2);
120  exp_p4 = _mm_set1_ps(1.6666665459e-1);
121  exp_p5 = _mm_set1_ps(5.0000001201e-1);
122 
123  for (; number < quarterPoints; number++) {
124  aVal = _mm_load_ps(aPtr);
125  tmp = _mm_setzero_ps();
126 
127  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
128 
129  /* express exp(x) as exp(g + n*log(2)) */
130  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
131 
132  emm0 = _mm_cvttps_epi32(fx);
133  tmp = _mm_cvtepi32_ps(emm0);
134 
135  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136  fx = _mm_sub_ps(tmp, mask);
137 
138  tmp = _mm_mul_ps(fx, exp_C1);
139  z = _mm_mul_ps(fx, exp_C2);
140  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141  z = _mm_mul_ps(aVal, aVal);
142 
143  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147  y = _mm_add_ps(y, one);
148 
149  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
150 
151  pow2n = _mm_castsi128_ps(emm0);
152  bVal = _mm_mul_ps(y, pow2n);
153 
154  _mm_store_ps(bPtr, bVal);
155  aPtr += 4;
156  bPtr += 4;
157  }
158 
159  number = quarterPoints * 4;
160  for (; number < num_points; number++) {
161  *bPtr++ = expf(*aPtr++);
162  }
163 }
164 
165 #endif /* LV_HAVE_SSE2 for aligned */
166 
167 
168 #ifdef LV_HAVE_GENERIC
169 
170 static inline void
171 volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
172 {
173  float* bPtr = bVector;
174  const float* aPtr = aVector;
175  unsigned int number = 0;
176 
177  for (number = 0; number < num_points; number++) {
178  *bPtr++ = expf(*aPtr++);
179  }
180 }
181 
182 #endif /* LV_HAVE_GENERIC */
183 
184 #endif /* INCLUDED_volk_32f_exp_32f_a_H */
185 
186 #ifndef INCLUDED_volk_32f_exp_32f_u_H
187 #define INCLUDED_volk_32f_exp_32f_u_H
188 
189 #ifdef LV_HAVE_SSE2
190 #include <emmintrin.h>
191 
192 static inline void
193 volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
194 {
195  float* bPtr = bVector;
196  const float* aPtr = aVector;
197 
198  unsigned int number = 0;
199  unsigned int quarterPoints = num_points / 4;
200 
201  // Declare variables and constants
202  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
203  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
204  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
205  __m128i emm0, pi32_0x7f;
206 
207  one = _mm_set1_ps(1.0);
208  exp_hi = _mm_set1_ps(88.3762626647949);
209  exp_lo = _mm_set1_ps(-88.3762626647949);
210  log2EF = _mm_set1_ps(1.44269504088896341);
211  half = _mm_set1_ps(0.5);
212  exp_C1 = _mm_set1_ps(0.693359375);
213  exp_C2 = _mm_set1_ps(-2.12194440e-4);
214  pi32_0x7f = _mm_set1_epi32(0x7f);
215 
216  exp_p0 = _mm_set1_ps(1.9875691500e-4);
217  exp_p1 = _mm_set1_ps(1.3981999507e-3);
218  exp_p2 = _mm_set1_ps(8.3334519073e-3);
219  exp_p3 = _mm_set1_ps(4.1665795894e-2);
220  exp_p4 = _mm_set1_ps(1.6666665459e-1);
221  exp_p5 = _mm_set1_ps(5.0000001201e-1);
222 
223 
224  for (; number < quarterPoints; number++) {
225  aVal = _mm_loadu_ps(aPtr);
226  tmp = _mm_setzero_ps();
227 
228  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
229 
230  /* express exp(x) as exp(g + n*log(2)) */
231  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
232 
233  emm0 = _mm_cvttps_epi32(fx);
234  tmp = _mm_cvtepi32_ps(emm0);
235 
236  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
237  fx = _mm_sub_ps(tmp, mask);
238 
239  tmp = _mm_mul_ps(fx, exp_C1);
240  z = _mm_mul_ps(fx, exp_C2);
241  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
242  z = _mm_mul_ps(aVal, aVal);
243 
244  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
245  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
246  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
247  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
248  y = _mm_add_ps(y, one);
249 
250  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
251 
252  pow2n = _mm_castsi128_ps(emm0);
253  bVal = _mm_mul_ps(y, pow2n);
254 
255  _mm_storeu_ps(bPtr, bVal);
256  aPtr += 4;
257  bPtr += 4;
258  }
259 
260  number = quarterPoints * 4;
261  for (; number < num_points; number++) {
262  *bPtr++ = expf(*aPtr++);
263  }
264 }
265 
266 #endif /* LV_HAVE_SSE2 for unaligned */
267 
268 
269 #ifdef LV_HAVE_GENERIC
270 
271 static inline void
272 volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
273 {
274  float* bPtr = bVector;
275  const float* aPtr = aVector;
276  unsigned int number = 0;
277 
278  for (number = 0; number < num_points; number++) {
279  *bPtr++ = expf(*aPtr++);
280  }
281 }
282 
283 #endif /* LV_HAVE_GENERIC */
284 
285 #endif /* INCLUDED_volk_32f_exp_32f_u_H */
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
Definition: sse2neon.h:5565
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2080
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:93
static void volk_32f_exp_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:272
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:193
static void volk_32f_exp_32f_a_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:171