79 #ifndef INCLUDED_volk_32f_log2_32f_a_H
80 #define INCLUDED_volk_32f_log2_32f_a_H
/*
 * Compile-time selector for the mantissa polynomial used by the SIMD kernels
 * in this file. Each kernel contains an `#if LOG_POLY_DEGREE == N` chain for
 * N = 6, 5, 4 and 3; a value of N picks the POLY(N-1) evaluator with its own
 * coefficient set.
 */
87 #define LOG_POLY_DEGREE 6
89 #ifdef LV_HAVE_GENERIC
94 float* bPtr = bVector;
95 const float* aPtr = aVector;
96 unsigned int number = 0;
98 for (number = 0; number < num_points; number++)
103 #if LV_HAVE_AVX2 && LV_HAVE_FMA
104 #include <immintrin.h>
/*
 * Horner-form polynomial evaluators for __m256 (packed single-precision).
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm256_fmadd_ps(inner, x, c0); every scalar coefficient
 * is broadcast with _mm256_set1_ps. POLY0_FMAAVX2 is the innermost constant
 * term and does not use its `x` argument.
 *
 * `x` is pasted into the expansion once per nesting level, so pass a plain
 * variable rather than an expression with side effects.
 */
106 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
107 #define POLY1_FMAAVX2(x, c0, c1) \
108 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
109 #define POLY2_FMAAVX2(x, c0, c1, c2) \
110 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
111 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
112 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
113 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
114 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
115 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
116 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
118 static inline void volk_32f_log2_32f_a_avx2_fma(
float* bVector,
119 const float* aVector,
120 unsigned int num_points)
122 float* bPtr = bVector;
123 const float* aPtr = aVector;
125 unsigned int number = 0;
126 const unsigned int eighthPoints = num_points / 8;
128 __m256 aVal, bVal, mantissa, frac, leadingOne;
131 for (; number < eighthPoints; number++) {
133 aVal = _mm256_load_ps(aPtr);
134 bias = _mm256_set1_epi32(127);
135 leadingOne = _mm256_set1_ps(1.0f);
136 exp = _mm256_sub_epi32(
137 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
138 _mm256_set1_epi32(0x7f800000)),
141 bVal = _mm256_cvtepi32_ps(exp);
146 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
148 #if LOG_POLY_DEGREE == 6
149 mantissa = POLY5_FMAAVX2(frac,
156 #elif LOG_POLY_DEGREE == 5
157 mantissa = POLY4_FMAAVX2(frac,
158 2.8882704548164776201f,
159 -2.52074962577807006663f,
160 1.48116647521213171641f,
161 -0.465725644288844778798f,
162 0.0596515482674574969533f);
163 #elif LOG_POLY_DEGREE == 4
164 mantissa = POLY3_FMAAVX2(frac,
165 2.61761038894603480148f,
166 -1.75647175389045657003f,
167 0.688243882994381274313f,
168 -0.107254423828329604454f);
169 #elif LOG_POLY_DEGREE == 3
170 mantissa = POLY2_FMAAVX2(frac,
171 2.28330284476918490682f,
172 -1.04913055217340124191f,
173 0.204446009836232697516f);
178 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
179 _mm256_store_ps(bPtr, bVal);
185 number = eighthPoints * 8;
192 #include <immintrin.h>
/*
 * Horner-form polynomial evaluators for __m256 (packed single-precision),
 * written without FMA.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is a separate _mm256_mul_ps followed by _mm256_add_ps; every
 * scalar coefficient is broadcast with _mm256_set1_ps. POLY0_AVX2 is the
 * innermost constant term and does not use its `x` argument.
 *
 * `x` is pasted into the expansion once per nesting level, so pass a plain
 * variable rather than an expression with side effects.
 */
194 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
195 #define POLY1_AVX2(x, c0, c1) \
196 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
197 #define POLY2_AVX2(x, c0, c1, c2) \
198 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
199 #define POLY3_AVX2(x, c0, c1, c2, c3) \
200 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
201 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
202 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
203 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
204 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
207 volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
209 float* bPtr = bVector;
210 const float* aPtr = aVector;
212 unsigned int number = 0;
213 const unsigned int eighthPoints = num_points / 8;
215 __m256 aVal, bVal, mantissa, frac, leadingOne;
218 for (; number < eighthPoints; number++) {
220 aVal = _mm256_load_ps(aPtr);
221 bias = _mm256_set1_epi32(127);
222 leadingOne = _mm256_set1_ps(1.0f);
223 exp = _mm256_sub_epi32(
224 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
225 _mm256_set1_epi32(0x7f800000)),
228 bVal = _mm256_cvtepi32_ps(exp);
233 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
235 #if LOG_POLY_DEGREE == 6
236 mantissa = POLY5_AVX2(frac,
243 #elif LOG_POLY_DEGREE == 5
244 mantissa = POLY4_AVX2(frac,
245 2.8882704548164776201f,
246 -2.52074962577807006663f,
247 1.48116647521213171641f,
248 -0.465725644288844778798f,
249 0.0596515482674574969533f);
250 #elif LOG_POLY_DEGREE == 4
251 mantissa = POLY3_AVX2(frac,
252 2.61761038894603480148f,
253 -1.75647175389045657003f,
254 0.688243882994381274313f,
255 -0.107254423828329604454f);
256 #elif LOG_POLY_DEGREE == 3
257 mantissa = POLY2_AVX2(frac,
258 2.28330284476918490682f,
259 -1.04913055217340124191f,
260 0.204446009836232697516f);
266 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
267 _mm256_store_ps(bPtr, bVal);
273 number = eighthPoints * 8;
279 #ifdef LV_HAVE_SSE4_1
280 #include <smmintrin.h>
/*
 * Horner-form polynomial evaluators for __m128 (packed single-precision).
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm_mul_ps followed by _mm_add_ps; every scalar
 * coefficient is broadcast with _mm_set1_ps. POLY0 is the innermost constant
 * term and does not use its `x` argument.
 *
 * `x` is pasted into the expansion once per nesting level, so pass a plain
 * variable rather than an expression with side effects.
 * NOTE(review): the names POLY0..POLY5 carry no ISA suffix, unlike the
 * *_AVX2 / *_FMAAVX2 families in this file, and are defined a second time
 * with the same replacement lists further down.
 */
282 #define POLY0(x, c0) _mm_set1_ps(c0)
283 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
284 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
285 #define POLY3(x, c0, c1, c2, c3) \
286 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
287 #define POLY4(x, c0, c1, c2, c3, c4) \
288 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
289 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
290 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
293 volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
295 float* bPtr = bVector;
296 const float* aPtr = aVector;
298 unsigned int number = 0;
299 const unsigned int quarterPoints = num_points / 4;
301 __m128 aVal, bVal, mantissa, frac, leadingOne;
304 for (; number < quarterPoints; number++) {
319 #if LOG_POLY_DEGREE == 6
320 mantissa = POLY5(frac,
327 #elif LOG_POLY_DEGREE == 5
328 mantissa = POLY4(frac,
329 2.8882704548164776201f,
330 -2.52074962577807006663f,
331 1.48116647521213171641f,
332 -0.465725644288844778798f,
333 0.0596515482674574969533f);
334 #elif LOG_POLY_DEGREE == 4
335 mantissa = POLY3(frac,
336 2.61761038894603480148f,
337 -1.75647175389045657003f,
338 0.688243882994381274313f,
339 -0.107254423828329604454f);
340 #elif LOG_POLY_DEGREE == 3
341 mantissa = POLY2(frac,
342 2.28330284476918490682f,
343 -1.04913055217340124191f,
344 0.204446009836232697516f);
356 number = quarterPoints * 4;
363 #include <arm_neon.h>
/*
 * Declares, in the scope where it is expanded, the constant vectors that
 * VLOG2Q_NEON_F32 refers to by name:
 *   one       - 0x00800000 in every lane (bit 23), OR-ed into the masked
 *               significand bits by VLOG2Q_NEON_F32
 *   p0 .. p6  - the seven polynomial coefficients, lowest order first
 *   exp_mask  - 0x7f800000, keeps bits 23..30 of each lane
 *   sig_mask  - 0x007fffff, keeps bits 0..22 of each lane
 *   exp_bias  - 127, subtracted from the shifted exponent field
 *
 * The expansion is a sequence of declarations (no enclosing block), so it
 * must appear where declarations are allowed and only once per scope.
 * The coefficient literals have no `f` suffix, i.e. they are double constants
 * converted to float at the vdupq_n_f32 call.
 */
366 #define VLOG2Q_NEON_PREAMBLE() \
367 int32x4_t one = vdupq_n_s32(0x000800000); \
369 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
370 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
371 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
372 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
373 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
374 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
375 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
376 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
377 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
378 int32x4_t exp_bias = vdupq_n_s32(127);
/*
 * Computes a four-lane log2 approximation into `log2_approx` (float32x4_t)
 * from `aval` (int32x4_t holding the raw bit patterns of the input floats).
 * Relies on the names declared by VLOG2Q_NEON_PREAMBLE being in scope.
 *
 * Steps, in expansion order:
 *   1. Mask out the exponent field (exp_mask) and the significand field
 *      (sig_mask); bit 31 is in neither mask, so the sign is ignored.
 *   2. Shift the exponent field right by 23 to get the biased exponent.
 *   3. OR `one` (bit 23) into the significand and convert it with
 *      vcvtq_n_f32_s32(..., 23), i.e. as a fixed-point value with 23
 *      fractional bits, giving s = 1.fraction.
 *   4. Subtract exp_bias and convert the exponent to float (e).
 *   5. Accumulate term by term:
 *        log2_approx = e + p0 + p1*s + p2*s^2 + p3*s^3
 *                        + p4*s^4 + p5*s^5 + p6*s^6
 *      where the powers are built as s^2 = s*s, s^3 = s^2*s, s^4 = s^2*s^2,
 *      s^5 = s^3*s^2, s^6 = s^3*s^3.
 *
 * The expansion declares exponent_i, significand_i, significand_f,
 * exponent_f, tmp1 and sig_2..sig_6 directly in the enclosing scope, so it
 * can be expanded at most once per scope, and `aval` is pasted in twice.
 */
381 #define VLOG2Q_NEON_F32(log2_approx, aval) \
382 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
383 int32x4_t significand_i = vandq_s32(aval, sig_mask); \
384 exponent_i = vshrq_n_s32(exponent_i, 23); \
389 significand_i = vorrq_s32(one, significand_i); \
390 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
392 exponent_i = vsubq_s32(exponent_i, exp_bias); \
393 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
397 log2_approx = vaddq_f32(exponent_f, p0); \
398 float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
399 log2_approx = vaddq_f32(log2_approx, tmp1); \
400 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
401 tmp1 = vmulq_f32(sig_2, p2); \
402 log2_approx = vaddq_f32(log2_approx, tmp1); \
404 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
405 tmp1 = vmulq_f32(sig_3, p3); \
406 log2_approx = vaddq_f32(log2_approx, tmp1); \
407 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
408 tmp1 = vmulq_f32(sig_4, p4); \
409 log2_approx = vaddq_f32(log2_approx, tmp1); \
410 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
411 tmp1 = vmulq_f32(sig_5, p5); \
412 log2_approx = vaddq_f32(log2_approx, tmp1); \
413 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
414 tmp1 = vmulq_f32(sig_6, p6); \
415 log2_approx = vaddq_f32(log2_approx, tmp1);
420 float* bPtr = bVector;
421 const float* aPtr = aVector;
423 const unsigned int quarterPoints = num_points / 4;
426 float32x4_t log2_approx;
437 for (number = 0; number < quarterPoints; ++number) {
439 aval = vld1q_s32((
int*)aPtr);
443 vst1q_f32(bPtr, log2_approx);
449 number = quarterPoints * 4;
458 #ifndef INCLUDED_volk_32f_log2_32f_u_H
459 #define INCLUDED_volk_32f_log2_32f_u_H
462 #ifdef LV_HAVE_GENERIC
467 float* bPtr = bVector;
468 const float* aPtr = aVector;
469 unsigned int number = 0;
471 for (number = 0; number < num_points; number++) {
472 float const result = log2f(*aPtr++);
473 *bPtr++ = isinf(result) ? -127.0f : result;
480 #ifdef LV_HAVE_SSE4_1
481 #include <smmintrin.h>
/*
 * Horner-form polynomial evaluators for __m128 (packed single-precision),
 * used by the unaligned SSE4.1 kernel.
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm_mul_ps followed by _mm_add_ps; every scalar
 * coefficient is broadcast with _mm_set1_ps. POLY0 is the innermost constant
 * term and does not use its `x` argument.
 *
 * These repeat the POLY0..POLY5 definitions that appear earlier in the file
 * with the same replacement lists. `x` is pasted into the expansion once per
 * nesting level, so pass a plain variable rather than an expression with
 * side effects.
 */
483 #define POLY0(x, c0) _mm_set1_ps(c0)
484 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
485 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
486 #define POLY3(x, c0, c1, c2, c3) \
487 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
488 #define POLY4(x, c0, c1, c2, c3, c4) \
489 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
490 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
491 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
494 volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
496 float* bPtr = bVector;
497 const float* aPtr = aVector;
499 unsigned int number = 0;
500 const unsigned int quarterPoints = num_points / 4;
502 __m128 aVal, bVal, mantissa, frac, leadingOne;
505 for (; number < quarterPoints; number++) {
520 #if LOG_POLY_DEGREE == 6
521 mantissa = POLY5(frac,
528 #elif LOG_POLY_DEGREE == 5
529 mantissa = POLY4(frac,
530 2.8882704548164776201f,
531 -2.52074962577807006663f,
532 1.48116647521213171641f,
533 -0.465725644288844778798f,
534 0.0596515482674574969533f);
535 #elif LOG_POLY_DEGREE == 4
536 mantissa = POLY3(frac,
537 2.61761038894603480148f,
538 -1.75647175389045657003f,
539 0.688243882994381274313f,
540 -0.107254423828329604454f);
541 #elif LOG_POLY_DEGREE == 3
542 mantissa = POLY2(frac,
543 2.28330284476918490682f,
544 -1.04913055217340124191f,
545 0.204446009836232697516f);
557 number = quarterPoints * 4;
563 #if LV_HAVE_AVX2 && LV_HAVE_FMA
564 #include <immintrin.h>
/*
 * Horner-form polynomial evaluators for __m256 (packed single-precision),
 * used by the unaligned AVX2+FMA kernel.
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm256_fmadd_ps(inner, x, c0); every scalar coefficient
 * is broadcast with _mm256_set1_ps. POLY0_FMAAVX2 is the innermost constant
 * term and does not use its `x` argument.
 *
 * These repeat the POLY*_FMAAVX2 definitions that appear earlier in the file
 * with the same replacement lists. `x` is pasted into the expansion once per
 * nesting level, so pass a plain variable rather than an expression with
 * side effects.
 */
566 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
567 #define POLY1_FMAAVX2(x, c0, c1) \
568 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
569 #define POLY2_FMAAVX2(x, c0, c1, c2) \
570 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
571 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
572 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
573 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
574 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
575 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
576 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
578 static inline void volk_32f_log2_32f_u_avx2_fma(
float* bVector,
579 const float* aVector,
580 unsigned int num_points)
582 float* bPtr = bVector;
583 const float* aPtr = aVector;
585 unsigned int number = 0;
586 const unsigned int eighthPoints = num_points / 8;
588 __m256 aVal, bVal, mantissa, frac, leadingOne;
591 for (; number < eighthPoints; number++) {
593 aVal = _mm256_loadu_ps(aPtr);
594 bias = _mm256_set1_epi32(127);
595 leadingOne = _mm256_set1_ps(1.0f);
596 exp = _mm256_sub_epi32(
597 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
598 _mm256_set1_epi32(0x7f800000)),
601 bVal = _mm256_cvtepi32_ps(exp);
606 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
608 #if LOG_POLY_DEGREE == 6
609 mantissa = POLY5_FMAAVX2(frac,
616 #elif LOG_POLY_DEGREE == 5
617 mantissa = POLY4_FMAAVX2(frac,
618 2.8882704548164776201f,
619 -2.52074962577807006663f,
620 1.48116647521213171641f,
621 -0.465725644288844778798f,
622 0.0596515482674574969533f);
623 #elif LOG_POLY_DEGREE == 4
624 mantissa = POLY3_FMAAVX2(frac,
625 2.61761038894603480148f,
626 -1.75647175389045657003f,
627 0.688243882994381274313f,
628 -0.107254423828329604454f);
629 #elif LOG_POLY_DEGREE == 3
630 mantissa = POLY2_FMAAVX2(frac,
631 2.28330284476918490682f,
632 -1.04913055217340124191f,
633 0.204446009836232697516f);
638 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
639 _mm256_storeu_ps(bPtr, bVal);
645 number = eighthPoints * 8;
652 #include <immintrin.h>
/*
 * Horner-form polynomial evaluators for __m256 (packed single-precision),
 * written without FMA and used by the unaligned AVX2 kernel.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is a separate _mm256_mul_ps followed by _mm256_add_ps; every
 * scalar coefficient is broadcast with _mm256_set1_ps. POLY0_AVX2 is the
 * innermost constant term and does not use its `x` argument.
 *
 * These repeat the POLY*_AVX2 definitions that appear earlier in the file
 * with the same replacement lists. `x` is pasted into the expansion once per
 * nesting level, so pass a plain variable rather than an expression with
 * side effects.
 */
654 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
655 #define POLY1_AVX2(x, c0, c1) \
656 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
657 #define POLY2_AVX2(x, c0, c1, c2) \
658 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
659 #define POLY3_AVX2(x, c0, c1, c2, c3) \
660 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
661 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
662 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
663 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
664 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
667 volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
669 float* bPtr = bVector;
670 const float* aPtr = aVector;
672 unsigned int number = 0;
673 const unsigned int eighthPoints = num_points / 8;
675 __m256 aVal, bVal, mantissa, frac, leadingOne;
678 for (; number < eighthPoints; number++) {
680 aVal = _mm256_loadu_ps(aPtr);
681 bias = _mm256_set1_epi32(127);
682 leadingOne = _mm256_set1_ps(1.0f);
683 exp = _mm256_sub_epi32(
684 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
685 _mm256_set1_epi32(0x7f800000)),
688 bVal = _mm256_cvtepi32_ps(exp);
693 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
695 #if LOG_POLY_DEGREE == 6
696 mantissa = POLY5_AVX2(frac,
703 #elif LOG_POLY_DEGREE == 5
704 mantissa = POLY4_AVX2(frac,
705 2.8882704548164776201f,
706 -2.52074962577807006663f,
707 1.48116647521213171641f,
708 -0.465725644288844778798f,
709 0.0596515482674574969533f);
710 #elif LOG_POLY_DEGREE == 4
711 mantissa = POLY3_AVX2(frac,
712 2.61761038894603480148f,
713 -1.75647175389045657003f,
714 0.688243882994381274313f,
715 -0.107254423828329604454f);
716 #elif LOG_POLY_DEGREE == 3
717 mantissa = POLY2_AVX2(frac,
718 2.28330284476918490682f,
719 -1.04913055217340124191f,
720 0.204446009836232697516f);
726 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
727 _mm256_storeu_ps(bPtr, bVal);
733 number = eighthPoints * 8;
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6087
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3230
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_log2_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:462
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:415
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:381
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:366
static float log2f_non_ieee(float f)
Definition: volk_common.h:159