55 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
56 #define INCLUDED_volk_32f_tanh_32f_a_H
/* Fragment of the plain-C kernel guarded by LV_HAVE_GENERIC. The function
 * signature and the closing braces are not part of this excerpt. */
64 #ifdef LV_HAVE_GENERIC
69 unsigned int number = 0;
70 float* cPtr = cVector;
71 const float* aPtr = aVector;
/* One element per iteration, num_points iterations in total. */
72 for (; number < num_points; number++) {
/* Write tanhf() of the current input element to the current output element,
 * then post-increment both cursors. */
73 *cPtr++ = tanhf(*aPtr++);
/* Fragment of a second scalar kernel, also guarded by LV_HAVE_GENERIC.
 * Only part of the loop body is visible: the leading `if`, the branch bodies
 * and the statement that consumes a and b are missing from this excerpt. */
80 #ifdef LV_HAVE_GENERIC
85 float* cPtr = cVector;
86 const float* aPtr = aVector;
87 for (
unsigned int number = 0; number < num_points; number++) {
/* Branch taken for inputs at or below -4.97. The literal has no `f` suffix,
 * so *aPtr is converted to double for this comparison. */
90 else if (*aPtr <= -4.97)
/* x2 is the square of the current input x. */
93 float x2 = (*aPtr) * (*aPtr);
/* a = x * (135135 + x2 * (17325 + x2 * (378 + x2))), written in nested form. */
94 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
/* b = 135135 + x2 * (62370 + x2 * (3150 + 28 * x2)), written in nested form.
 * NOTE(review): presumably the result is a / b, as in the SIMD variants of
 * this file -- the dividing line is not shown here; confirm. */
95 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/* Fragment of the SSE kernel. The signature, the constant initialisation and
 * the whole loop body are missing from this excerpt. */
106 #include <xmmintrin.h>
111 unsigned int number = 0;
/* Number of complete groups of four floats in the input. */
112 const unsigned int quarterPoints = num_points / 4;
114 float* cPtr = cVector;
115 const float* aPtr = aVector;
117 __m128 aVal, cVal, x2, a, b;
118 __m128 const1, const2, const3, const4, const5, const6;
/* One iteration per group of four floats. */
125 for (; number < quarterPoints; number++) {
/* Set number to the count of elements covered by the full groups of four.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
150 number = quarterPoints * 4;
/* Fragment of the AVX kernel that uses _mm256_load_ps / _mm256_store_ps.
 * The signature, the heads of the two polynomial expressions, the pointer
 * advances and the closing braces are missing from this excerpt. */
157 #include <immintrin.h>
162 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
163 const unsigned int eighthPoints = num_points / 8;
165 float* cPtr = cVector;
166 const float* aPtr = aVector;
168 __m256 aVal, cVal, x2, a, b;
169 __m256 const1, const2, const3, const4, const5, const6;
/* Broadcast each polynomial coefficient to every lane of a 256-bit vector. */
170 const1 = _mm256_set1_ps(135135.0f);
171 const2 = _mm256_set1_ps(17325.0f);
172 const3 = _mm256_set1_ps(378.0f);
173 const4 = _mm256_set1_ps(62370.0f);
174 const5 = _mm256_set1_ps(3150.0f);
175 const6 = _mm256_set1_ps(28.0f);
/* One iteration per group of eight floats. */
176 for (; number < eighthPoints; number++) {
/* Load eight inputs from aPtr and square them lane-wise. */
178 aVal = _mm256_load_ps(aPtr);
179 x2 = _mm256_mul_ps(aVal, aVal);
/* Tail of a nested add/mul expression; the visible part evaluates
 * const2 + x2 * (const3 + x2). NOTE(review): presumably the end of the
 * assignment to a, whose head is not shown; confirm. */
186 _mm256_add_ps(const2,
187 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/* Tail of a second nested expression; the visible part evaluates
 * const5 + x2 * const6. NOTE(review): presumably the end of the assignment
 * to b, whose head is not shown; confirm. */
195 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b is the result for these eight elements. */
197 cVal = _mm256_div_ps(a, b);
/* Store the eight results at cPtr. */
199 _mm256_store_ps(cPtr, cVal);
/* Set number to the count of elements covered by the full groups of eight.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
205 number = eighthPoints * 8;
/* Fragment of the AVX + FMA kernel that uses _mm256_load_ps /
 * _mm256_store_ps; compiled only when both LV_HAVE_AVX and LV_HAVE_FMA are
 * non-zero. The return type, the braces, the heads of the two polynomial
 * expressions and the pointer advances are missing from this excerpt.
 *
 * cVector:    output cursor base; results are written here.
 * aVector:    input cursor base; values are read from here.
 * num_points: element count used to derive the number of 8-float groups. */
210 #if LV_HAVE_AVX && LV_HAVE_FMA
211 #include <immintrin.h>
214 volk_32f_tanh_32f_a_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
216 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
217 const unsigned int eighthPoints = num_points / 8;
219 float* cPtr = cVector;
220 const float* aPtr = aVector;
222 __m256 aVal, cVal, x2, a, b;
223 __m256 const1, const2, const3, const4, const5, const6;
/* Broadcast each polynomial coefficient to every lane of a 256-bit vector. */
224 const1 = _mm256_set1_ps(135135.0f);
225 const2 = _mm256_set1_ps(17325.0f);
226 const3 = _mm256_set1_ps(378.0f);
227 const4 = _mm256_set1_ps(62370.0f);
228 const5 = _mm256_set1_ps(3150.0f);
229 const6 = _mm256_set1_ps(28.0f);
/* One iteration per group of eight floats. */
230 for (; number < eighthPoints; number++) {
/* Load eight inputs from aPtr and square them lane-wise. */
232 aVal = _mm256_load_ps(aPtr);
233 x2 = _mm256_mul_ps(aVal, aVal);
/* Argument tail of an expression whose head is not shown. The visible inner
 * fused multiply-add evaluates x2 * (const3 + x2) + const2; x2 and const1 are
 * the remaining arguments of the enclosing call. NOTE(review): presumably the
 * end of the assignment to a; confirm. */
237 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
/* Argument tail of a second expression whose head is not shown. The visible
 * inner fused multiply-adds evaluate x2 * (x2 * const6 + const5) + const4.
 * NOTE(review): presumably the end of the assignment to b; confirm. */
239 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Lane-wise quotient a / b is the result for these eight elements. */
241 cVal = _mm256_div_ps(a, b);
/* Store the eight results at cPtr. */
243 _mm256_store_ps(cPtr, cVal);
/* Set number to the count of elements covered by the full groups of eight.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
249 number = eighthPoints * 8;
257 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
258 #define INCLUDED_volk_32f_tanh_32f_u_H
260 #include <inttypes.h>
/* Fragment of the SSE kernel in the section guarded by
 * INCLUDED_volk_32f_tanh_32f_u_H. The signature, the constant initialisation
 * and the whole loop body are missing from this excerpt. */
267 #include <xmmintrin.h>
272 unsigned int number = 0;
/* Number of complete groups of four floats in the input. */
273 const unsigned int quarterPoints = num_points / 4;
275 float* cPtr = cVector;
276 const float* aPtr = aVector;
278 __m128 aVal, cVal, x2, a, b;
279 __m128 const1, const2, const3, const4, const5, const6;
/* One iteration per group of four floats. */
286 for (; number < quarterPoints; number++) {
/* Set number to the count of elements covered by the full groups of four.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
311 number = quarterPoints * 4;
/* Fragment of the AVX kernel that uses the unaligned _mm256_loadu_ps /
 * _mm256_storeu_ps accessors. The signature, the heads of the two polynomial
 * expressions, the pointer advances and the closing braces are missing from
 * this excerpt. */
318 #include <immintrin.h>
323 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
324 const unsigned int eighthPoints = num_points / 8;
326 float* cPtr = cVector;
327 const float* aPtr = aVector;
329 __m256 aVal, cVal, x2, a, b;
330 __m256 const1, const2, const3, const4, const5, const6;
/* Broadcast each polynomial coefficient to every lane of a 256-bit vector. */
331 const1 = _mm256_set1_ps(135135.0f);
332 const2 = _mm256_set1_ps(17325.0f);
333 const3 = _mm256_set1_ps(378.0f);
334 const4 = _mm256_set1_ps(62370.0f);
335 const5 = _mm256_set1_ps(3150.0f);
336 const6 = _mm256_set1_ps(28.0f);
/* One iteration per group of eight floats. */
337 for (; number < eighthPoints; number++) {
/* Unaligned load of eight inputs from aPtr, then square them lane-wise. */
339 aVal = _mm256_loadu_ps(aPtr);
340 x2 = _mm256_mul_ps(aVal, aVal);
/* Tail of a nested add/mul expression; the visible part evaluates
 * const2 + x2 * (const3 + x2). NOTE(review): presumably the end of the
 * assignment to a, whose head is not shown; confirm. */
347 _mm256_add_ps(const2,
348 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
/* Tail of a second nested expression; the visible part evaluates
 * const5 + x2 * const6. NOTE(review): presumably the end of the assignment
 * to b, whose head is not shown; confirm. */
356 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b is the result for these eight elements. */
358 cVal = _mm256_div_ps(a, b);
/* Unaligned store of the eight results at cPtr. */
360 _mm256_storeu_ps(cPtr, cVal);
/* Set number to the count of elements covered by the full groups of eight.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
366 number = eighthPoints * 8;
/* Fragment of the AVX + FMA kernel that uses the unaligned _mm256_loadu_ps /
 * _mm256_storeu_ps accessors; compiled only when both LV_HAVE_AVX and
 * LV_HAVE_FMA are non-zero. The return type, the braces, the heads of the two
 * polynomial expressions and the pointer advances are missing from this
 * excerpt.
 *
 * cVector:    output cursor base; results are written here.
 * aVector:    input cursor base; values are read from here.
 * num_points: element count used to derive the number of 8-float groups. */
371 #if LV_HAVE_AVX && LV_HAVE_FMA
372 #include <immintrin.h>
375 volk_32f_tanh_32f_u_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
377 unsigned int number = 0;
/* Number of complete groups of eight floats in the input. */
378 const unsigned int eighthPoints = num_points / 8;
380 float* cPtr = cVector;
381 const float* aPtr = aVector;
383 __m256 aVal, cVal, x2, a, b;
384 __m256 const1, const2, const3, const4, const5, const6;
/* Broadcast each polynomial coefficient to every lane of a 256-bit vector. */
385 const1 = _mm256_set1_ps(135135.0f);
386 const2 = _mm256_set1_ps(17325.0f);
387 const3 = _mm256_set1_ps(378.0f);
388 const4 = _mm256_set1_ps(62370.0f);
389 const5 = _mm256_set1_ps(3150.0f);
390 const6 = _mm256_set1_ps(28.0f);
/* One iteration per group of eight floats. */
391 for (; number < eighthPoints; number++) {
/* Unaligned load of eight inputs from aPtr, then square them lane-wise. */
393 aVal = _mm256_loadu_ps(aPtr);
394 x2 = _mm256_mul_ps(aVal, aVal);
/* Argument tail of an expression whose head is not shown. The visible inner
 * fused multiply-add evaluates x2 * (const3 + x2) + const2; x2 and const1 are
 * the remaining arguments of the enclosing call. NOTE(review): presumably the
 * end of the assignment to a; confirm. */
398 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
/* Argument tail of a second expression whose head is not shown. The visible
 * inner fused multiply-adds evaluate x2 * (x2 * const6 + const5) + const4.
 * NOTE(review): presumably the end of the assignment to b; confirm. */
400 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Lane-wise quotient a / b is the result for these eight elements. */
402 cVal = _mm256_div_ps(a, b);
/* Unaligned store of the eight results at cPtr. */
404 _mm256_storeu_ps(cPtr, cVal);
/* Set number to the count of elements covered by the full groups of eight.
 * NOTE(review): presumably the start index of a scalar tail loop that is not
 * shown here; confirm. */
410 number = eighthPoints * 8;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:321
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:67
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:160
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:109
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:83
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:270