Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_tanh_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
55 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
56 #define INCLUDED_volk_32f_tanh_32f_a_H
57 
58 #include <inttypes.h>
59 #include <math.h>
60 #include <stdio.h>
61 #include <string.h>
62 
63 
64 #ifdef LV_HAVE_GENERIC
65 
66 static inline void
67 volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
68 {
69  unsigned int number = 0;
70  float* cPtr = cVector;
71  const float* aPtr = aVector;
72  for (; number < num_points; number++) {
73  *cPtr++ = tanhf(*aPtr++);
74  }
75 }
76 
77 #endif /* LV_HAVE_GENERIC */
78 
79 
80 #ifdef LV_HAVE_GENERIC
81 
/*!
 * \brief Scalar tanh approximation using a rational polynomial with saturation.
 *
 * For each sample x:
 *   - x >  4.97  -> +1
 *   - x <= -4.97 -> -1
 *   - otherwise  -> a / b, with
 *       a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *       b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 *
 * The input pointer is advanced exactly once per iteration regardless of the
 * branch taken, so a saturated sample never causes later outputs to be
 * computed from a stale input.
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process (0 is a no-op).
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (unsigned int number = 0; number < num_points; number++) {
        /* Consume the sample up front so every branch advances the input. */
        const float x = *aPtr++;
        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            const float x2 = x * x;
            const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
101 
102 #endif /* LV_HAVE_GENERIC */
103 
104 
105 #ifdef LV_HAVE_SSE
106 #include <xmmintrin.h>
107 
/*!
 * \brief SSE tanh approximation (aligned buffers), four floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * Lanes with x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1,
 * the saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. NaN lanes fail both
 * comparisons and therefore keep the NaN produced by the ratio.
 *
 * Uses _mm_load_ps / _mm_store_ps, which require 16-byte aligned addresses.
 * The num_points % 4 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer (16-byte aligned); receives num_points results.
 * \param aVector    Input buffer (16-byte aligned); num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    const __m128 upperLimit = _mm_set_ps1(4.97f);
    const __m128 lowerLimit = _mm_set_ps1(-4.97f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 x2 = _mm_mul_ps(aVal, aVal);
        const __m128 a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        const __m128 b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        __m128 cVal = _mm_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m128 hiMask = _mm_cmpgt_ps(aVal, upperLimit);
        const __m128 loMask = _mm_cmple_ps(aVal, lowerLimit);
        cVal = _mm_or_ps(_mm_and_ps(hiMask, plusOne), _mm_andnot_ps(hiMask, cVal));
        cVal = _mm_or_ps(_mm_and_ps(loMask, minusOne), _mm_andnot_ps(loMask, cVal));

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - quarterPoints * 4);
}
153 #endif /* LV_HAVE_SSE */
154 
155 
156 #ifdef LV_HAVE_AVX
157 #include <immintrin.h>
158 
/*!
 * \brief AVX tanh approximation (aligned buffers), eight floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * Lanes with x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1,
 * the saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. The comparisons are ordered,
 * so NaN lanes keep the NaN produced by the ratio.
 *
 * Uses _mm256_load_ps / _mm256_store_ps, which require 32-byte aligned
 * addresses. The num_points % 8 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer (32-byte aligned); receives num_points results.
 * \param aVector    Input buffer (32-byte aligned); num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        const __m256 b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        __m256 cVal = _mm256_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m256 hiMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        const __m256 loMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, minusOne, loMask);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
208 #endif /* LV_HAVE_AVX */
209 
210 #if LV_HAVE_AVX && LV_HAVE_FMA
211 #include <immintrin.h>
212 
/*!
 * \brief AVX+FMA tanh approximation (aligned buffers), eight floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * using fused multiply-adds for the nested polynomial steps. Lanes with
 * x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1, the
 * saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. The comparisons are ordered,
 * so NaN lanes keep the NaN produced by the ratio.
 *
 * Uses _mm256_load_ps / _mm256_store_ps, which require 32-byte aligned
 * addresses. The num_points % 8 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer (32-byte aligned); receives num_points results.
 * \param aVector    Input buffer (32-byte aligned); num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        const __m256 b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        __m256 cVal = _mm256_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m256 hiMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        const __m256 loMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, minusOne, loMask);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
252 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
253 
254 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */
255 
256 
257 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
258 #define INCLUDED_volk_32f_tanh_32f_u_H
259 
260 #include <inttypes.h>
261 #include <math.h>
262 #include <stdio.h>
263 #include <string.h>
264 
265 
266 #ifdef LV_HAVE_SSE
267 #include <xmmintrin.h>
268 
/*!
 * \brief SSE tanh approximation (unaligned buffers), four floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * Lanes with x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1,
 * the saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. NaN lanes fail both
 * comparisons and therefore keep the NaN produced by the ratio.
 *
 * Uses _mm_loadu_ps / _mm_storeu_ps, so no alignment is required of either
 * buffer. The num_points % 4 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    const __m128 upperLimit = _mm_set_ps1(4.97f);
    const __m128 lowerLimit = _mm_set_ps1(-4.97f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 x2 = _mm_mul_ps(aVal, aVal);
        const __m128 a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        const __m128 b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        __m128 cVal = _mm_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m128 hiMask = _mm_cmpgt_ps(aVal, upperLimit);
        const __m128 loMask = _mm_cmple_ps(aVal, lowerLimit);
        cVal = _mm_or_ps(_mm_and_ps(hiMask, plusOne), _mm_andnot_ps(hiMask, cVal));
        cVal = _mm_or_ps(_mm_and_ps(loMask, minusOne), _mm_andnot_ps(loMask, cVal));

        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - quarterPoints * 4);
}
314 #endif /* LV_HAVE_SSE */
315 
316 
317 #ifdef LV_HAVE_AVX
318 #include <immintrin.h>
319 
/*!
 * \brief AVX tanh approximation (unaligned buffers), eight floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * Lanes with x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1,
 * the saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. The comparisons are ordered,
 * so NaN lanes keep the NaN produced by the ratio.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so no alignment is required of
 * either buffer. The num_points % 8 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        const __m256 b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        __m256 cVal = _mm256_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m256 hiMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        const __m256 loMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, minusOne, loMask);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
369 #endif /* LV_HAVE_AVX */
370 
371 #if LV_HAVE_AVX && LV_HAVE_FMA
372 #include <immintrin.h>
373 
/*!
 * \brief AVX+FMA tanh approximation (unaligned buffers), eight floats per iteration.
 *
 * Each lane evaluates a / b with
 *   a = x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   b = 135135 + x^2 * (62370 + x^2 * (3150 + x^2 * 28))
 * using fused multiply-adds for the nested polynomial steps. Lanes with
 * x > 4.97 are then forced to +1 and lanes with x <= -4.97 to -1, the
 * saturation rule also used by volk_32f_tanh_32f_series(). Without it the
 * ratio grows beyond 1 in magnitude for large |x|. The comparisons are ordered,
 * so NaN lanes keep the NaN produced by the ratio.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so no alignment is required of
 * either buffer. The num_points % 8 trailing samples are handled by
 * volk_32f_tanh_32f_series().
 *
 * \param cVector    Output buffer; receives num_points results.
 * \param aVector    Input buffer; num_points samples are read.
 * \param num_points Number of samples to process.
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 x2 = _mm256_mul_ps(aVal, aVal);
        const __m256 a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        const __m256 b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        __m256 cVal = _mm256_div_ps(a, b);

        /* Saturate: select +/-1 where the input is outside (-4.97, 4.97]. */
        const __m256 hiMask = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        const __m256 loMask = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LE_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, hiMask);
        cVal = _mm256_blendv_ps(cVal, minusOne, loMask);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    volk_32f_tanh_32f_series(cPtr, aPtr, num_points - eighthPoints * 8);
}
413 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
414 
415 #endif /* INCLUDED_volk_32f_tanh_32f_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:321
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:67
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:160
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:109
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:83
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:270