Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
79 #ifndef INCLUDED_volk_32f_log2_32f_a_H
80 #define INCLUDED_volk_32f_log2_32f_a_H
81 
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 #include <stdlib.h>
86 
87 #define LOG_POLY_DEGREE 6
88 
89 #ifdef LV_HAVE_GENERIC
90 
91 static inline void
92 volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
93 {
94  float* bPtr = bVector;
95  const float* aPtr = aVector;
96  unsigned int number = 0;
97 
98  for (number = 0; number < num_points; number++)
99  *bPtr++ = log2f_non_ieee(*aPtr++);
100 }
101 #endif /* LV_HAVE_GENERIC */
102 
103 #if LV_HAVE_AVX2 && LV_HAVE_FMA
104 #include <immintrin.h>
105 
106 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
107 #define POLY1_FMAAVX2(x, c0, c1) \
108  _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
109 #define POLY2_FMAAVX2(x, c0, c1, c2) \
110  _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
111 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
112  _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
113 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
114  _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
115 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
116  _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
117 
118 static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
119  const float* aVector,
120  unsigned int num_points)
121 {
122  float* bPtr = bVector;
123  const float* aPtr = aVector;
124 
125  unsigned int number = 0;
126  const unsigned int eighthPoints = num_points / 8;
127 
128  __m256 aVal, bVal, mantissa, frac, leadingOne;
129  __m256i bias, exp;
130 
131  for (; number < eighthPoints; number++) {
132 
133  aVal = _mm256_load_ps(aPtr);
134  bias = _mm256_set1_epi32(127);
135  leadingOne = _mm256_set1_ps(1.0f);
136  exp = _mm256_sub_epi32(
137  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
138  _mm256_set1_epi32(0x7f800000)),
139  23),
140  bias);
141  bVal = _mm256_cvtepi32_ps(exp);
142 
143  // Now to extract mantissa
144  frac = _mm256_or_ps(
145  leadingOne,
146  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
147 
148 #if LOG_POLY_DEGREE == 6
149  mantissa = POLY5_FMAAVX2(frac,
150  3.1157899f,
151  -3.3241990f,
152  2.5988452f,
153  -1.2315303f,
154  3.1821337e-1f,
155  -3.4436006e-2f);
156 #elif LOG_POLY_DEGREE == 5
157  mantissa = POLY4_FMAAVX2(frac,
158  2.8882704548164776201f,
159  -2.52074962577807006663f,
160  1.48116647521213171641f,
161  -0.465725644288844778798f,
162  0.0596515482674574969533f);
163 #elif LOG_POLY_DEGREE == 4
164  mantissa = POLY3_FMAAVX2(frac,
165  2.61761038894603480148f,
166  -1.75647175389045657003f,
167  0.688243882994381274313f,
168  -0.107254423828329604454f);
169 #elif LOG_POLY_DEGREE == 3
170  mantissa = POLY2_FMAAVX2(frac,
171  2.28330284476918490682f,
172  -1.04913055217340124191f,
173  0.204446009836232697516f);
174 #else
175 #error
176 #endif
177 
178  bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
179  _mm256_store_ps(bPtr, bVal);
180 
181  aPtr += 8;
182  bPtr += 8;
183  }
184 
185  number = eighthPoints * 8;
186  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
187 }
188 
189 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
190 
191 #ifdef LV_HAVE_AVX2
192 #include <immintrin.h>
193 
194 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
195 #define POLY1_AVX2(x, c0, c1) \
196  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
197 #define POLY2_AVX2(x, c0, c1, c2) \
198  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
199 #define POLY3_AVX2(x, c0, c1, c2, c3) \
200  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
201 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
202  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
203 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
204  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
205 
206 static inline void
207 volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
208 {
209  float* bPtr = bVector;
210  const float* aPtr = aVector;
211 
212  unsigned int number = 0;
213  const unsigned int eighthPoints = num_points / 8;
214 
215  __m256 aVal, bVal, mantissa, frac, leadingOne;
216  __m256i bias, exp;
217 
218  for (; number < eighthPoints; number++) {
219 
220  aVal = _mm256_load_ps(aPtr);
221  bias = _mm256_set1_epi32(127);
222  leadingOne = _mm256_set1_ps(1.0f);
223  exp = _mm256_sub_epi32(
224  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
225  _mm256_set1_epi32(0x7f800000)),
226  23),
227  bias);
228  bVal = _mm256_cvtepi32_ps(exp);
229 
230  // Now to extract mantissa
231  frac = _mm256_or_ps(
232  leadingOne,
233  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
234 
235 #if LOG_POLY_DEGREE == 6
236  mantissa = POLY5_AVX2(frac,
237  3.1157899f,
238  -3.3241990f,
239  2.5988452f,
240  -1.2315303f,
241  3.1821337e-1f,
242  -3.4436006e-2f);
243 #elif LOG_POLY_DEGREE == 5
244  mantissa = POLY4_AVX2(frac,
245  2.8882704548164776201f,
246  -2.52074962577807006663f,
247  1.48116647521213171641f,
248  -0.465725644288844778798f,
249  0.0596515482674574969533f);
250 #elif LOG_POLY_DEGREE == 4
251  mantissa = POLY3_AVX2(frac,
252  2.61761038894603480148f,
253  -1.75647175389045657003f,
254  0.688243882994381274313f,
255  -0.107254423828329604454f);
256 #elif LOG_POLY_DEGREE == 3
257  mantissa = POLY2_AVX2(frac,
258  2.28330284476918490682f,
259  -1.04913055217340124191f,
260  0.204446009836232697516f);
261 #else
262 #error
263 #endif
264 
265  bVal =
266  _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
267  _mm256_store_ps(bPtr, bVal);
268 
269  aPtr += 8;
270  bPtr += 8;
271  }
272 
273  number = eighthPoints * 8;
274  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
275 }
276 
277 #endif /* LV_HAVE_AVX2 for aligned */
278 
279 #ifdef LV_HAVE_SSE4_1
280 #include <smmintrin.h>
281 
282 #define POLY0(x, c0) _mm_set1_ps(c0)
283 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
284 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
285 #define POLY3(x, c0, c1, c2, c3) \
286  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
287 #define POLY4(x, c0, c1, c2, c3, c4) \
288  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
289 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
290  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
291 
292 static inline void
293 volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
294 {
295  float* bPtr = bVector;
296  const float* aPtr = aVector;
297 
298  unsigned int number = 0;
299  const unsigned int quarterPoints = num_points / 4;
300 
301  __m128 aVal, bVal, mantissa, frac, leadingOne;
302  __m128i bias, exp;
303 
304  for (; number < quarterPoints; number++) {
305 
306  aVal = _mm_load_ps(aPtr);
307  bias = _mm_set1_epi32(127);
308  leadingOne = _mm_set1_ps(1.0f);
309  exp = _mm_sub_epi32(
311  _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
312  bias);
313  bVal = _mm_cvtepi32_ps(exp);
314 
315  // Now to extract mantissa
316  frac = _mm_or_ps(leadingOne,
317  _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
318 
319 #if LOG_POLY_DEGREE == 6
320  mantissa = POLY5(frac,
321  3.1157899f,
322  -3.3241990f,
323  2.5988452f,
324  -1.2315303f,
325  3.1821337e-1f,
326  -3.4436006e-2f);
327 #elif LOG_POLY_DEGREE == 5
328  mantissa = POLY4(frac,
329  2.8882704548164776201f,
330  -2.52074962577807006663f,
331  1.48116647521213171641f,
332  -0.465725644288844778798f,
333  0.0596515482674574969533f);
334 #elif LOG_POLY_DEGREE == 4
335  mantissa = POLY3(frac,
336  2.61761038894603480148f,
337  -1.75647175389045657003f,
338  0.688243882994381274313f,
339  -0.107254423828329604454f);
340 #elif LOG_POLY_DEGREE == 3
341  mantissa = POLY2(frac,
342  2.28330284476918490682f,
343  -1.04913055217340124191f,
344  0.204446009836232697516f);
345 #else
346 #error
347 #endif
348 
349  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
350  _mm_store_ps(bPtr, bVal);
351 
352  aPtr += 4;
353  bPtr += 4;
354  }
355 
356  number = quarterPoints * 4;
357  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
358 }
359 
360 #endif /* LV_HAVE_SSE4_1 for aligned */
361 
362 #ifdef LV_HAVE_NEON
363 #include <arm_neon.h>
364 
365 /* these macros allow us to embed logs in other kernels */
366 #define VLOG2Q_NEON_PREAMBLE() \
367  int32x4_t one = vdupq_n_s32(0x000800000); \
368  /* minimax polynomial */ \
369  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
370  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
371  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
372  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
373  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
374  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
375  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
376  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
377  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
378  int32x4_t exp_bias = vdupq_n_s32(127);
379 
380 
381 #define VLOG2Q_NEON_F32(log2_approx, aval) \
382  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
383  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
384  exponent_i = vshrq_n_s32(exponent_i, 23); \
385  \
386  /* extract the exponent and significand \
387  we can treat this as fixed point to save ~9% on the \
388  conversion + float add */ \
389  significand_i = vorrq_s32(one, significand_i); \
390  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
391  /* debias the exponent and convert to float */ \
392  exponent_i = vsubq_s32(exponent_i, exp_bias); \
393  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
394  \
395  /* put the significand through a polynomial fit of log2(x) [1,2] \
396  add the result to the exponent */ \
397  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
398  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
399  log2_approx = vaddq_f32(log2_approx, tmp1); \
400  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
401  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
402  log2_approx = vaddq_f32(log2_approx, tmp1); \
403  \
404  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
405  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
406  log2_approx = vaddq_f32(log2_approx, tmp1); \
407  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
408  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
409  log2_approx = vaddq_f32(log2_approx, tmp1); \
410  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
411  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
412  log2_approx = vaddq_f32(log2_approx, tmp1); \
413  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
414  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
415  log2_approx = vaddq_f32(log2_approx, tmp1);
416 
417 static inline void
418 volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
419 {
420  float* bPtr = bVector;
421  const float* aPtr = aVector;
422  unsigned int number;
423  const unsigned int quarterPoints = num_points / 4;
424 
425  int32x4_t aval;
426  float32x4_t log2_approx;
427 
429  // lms
430  // p0 = vdupq_n_f32(-1.649132280361871);
431  // p1 = vdupq_n_f32(1.995047138579499);
432  // p2 = vdupq_n_f32(-0.336914839219728);
433 
434  // keep in mind a single precision float is represented as
435  // (-1)^sign * 2^exp * 1.significand, so the log2 is
436  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
437  for (number = 0; number < quarterPoints; ++number) {
438  // load float in to an int register without conversion
439  aval = vld1q_s32((int*)aPtr);
440 
441  VLOG2Q_NEON_F32(log2_approx, aval)
442 
443  vst1q_f32(bPtr, log2_approx);
444 
445  aPtr += 4;
446  bPtr += 4;
447  }
448 
449  number = quarterPoints * 4;
450  volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
451 }
452 
453 #endif /* LV_HAVE_NEON */
454 
455 
456 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
457 
458 #ifndef INCLUDED_volk_32f_log2_32f_u_H
459 #define INCLUDED_volk_32f_log2_32f_u_H
460 
461 
462 #ifdef LV_HAVE_GENERIC
463 
464 static inline void
465 volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
466 {
467  float* bPtr = bVector;
468  const float* aPtr = aVector;
469  unsigned int number = 0;
470 
471  for (number = 0; number < num_points; number++) {
472  float const result = log2f(*aPtr++);
473  *bPtr++ = isinf(result) ? -127.0f : result;
474  }
475 }
476 
477 #endif /* LV_HAVE_GENERIC */
478 
479 
480 #ifdef LV_HAVE_SSE4_1
481 #include <smmintrin.h>
482 
483 #define POLY0(x, c0) _mm_set1_ps(c0)
484 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
485 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
486 #define POLY3(x, c0, c1, c2, c3) \
487  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
488 #define POLY4(x, c0, c1, c2, c3, c4) \
489  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
490 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
491  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
492 
493 static inline void
494 volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
495 {
496  float* bPtr = bVector;
497  const float* aPtr = aVector;
498 
499  unsigned int number = 0;
500  const unsigned int quarterPoints = num_points / 4;
501 
502  __m128 aVal, bVal, mantissa, frac, leadingOne;
503  __m128i bias, exp;
504 
505  for (; number < quarterPoints; number++) {
506 
507  aVal = _mm_loadu_ps(aPtr);
508  bias = _mm_set1_epi32(127);
509  leadingOne = _mm_set1_ps(1.0f);
510  exp = _mm_sub_epi32(
512  _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
513  bias);
514  bVal = _mm_cvtepi32_ps(exp);
515 
516  // Now to extract mantissa
517  frac = _mm_or_ps(leadingOne,
518  _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
519 
520 #if LOG_POLY_DEGREE == 6
521  mantissa = POLY5(frac,
522  3.1157899f,
523  -3.3241990f,
524  2.5988452f,
525  -1.2315303f,
526  3.1821337e-1f,
527  -3.4436006e-2f);
528 #elif LOG_POLY_DEGREE == 5
529  mantissa = POLY4(frac,
530  2.8882704548164776201f,
531  -2.52074962577807006663f,
532  1.48116647521213171641f,
533  -0.465725644288844778798f,
534  0.0596515482674574969533f);
535 #elif LOG_POLY_DEGREE == 4
536  mantissa = POLY3(frac,
537  2.61761038894603480148f,
538  -1.75647175389045657003f,
539  0.688243882994381274313f,
540  -0.107254423828329604454f);
541 #elif LOG_POLY_DEGREE == 3
542  mantissa = POLY2(frac,
543  2.28330284476918490682f,
544  -1.04913055217340124191f,
545  0.204446009836232697516f);
546 #else
547 #error
548 #endif
549 
550  bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
551  _mm_storeu_ps(bPtr, bVal);
552 
553  aPtr += 4;
554  bPtr += 4;
555  }
556 
557  number = quarterPoints * 4;
558  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
559 }
560 
561 #endif /* LV_HAVE_SSE4_1 for unaligned */
562 
563 #if LV_HAVE_AVX2 && LV_HAVE_FMA
564 #include <immintrin.h>
565 
566 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
567 #define POLY1_FMAAVX2(x, c0, c1) \
568  _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
569 #define POLY2_FMAAVX2(x, c0, c1, c2) \
570  _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
571 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
572  _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
573 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
574  _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
575 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
576  _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
577 
578 static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
579  const float* aVector,
580  unsigned int num_points)
581 {
582  float* bPtr = bVector;
583  const float* aPtr = aVector;
584 
585  unsigned int number = 0;
586  const unsigned int eighthPoints = num_points / 8;
587 
588  __m256 aVal, bVal, mantissa, frac, leadingOne;
589  __m256i bias, exp;
590 
591  for (; number < eighthPoints; number++) {
592 
593  aVal = _mm256_loadu_ps(aPtr);
594  bias = _mm256_set1_epi32(127);
595  leadingOne = _mm256_set1_ps(1.0f);
596  exp = _mm256_sub_epi32(
597  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
598  _mm256_set1_epi32(0x7f800000)),
599  23),
600  bias);
601  bVal = _mm256_cvtepi32_ps(exp);
602 
603  // Now to extract mantissa
604  frac = _mm256_or_ps(
605  leadingOne,
606  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
607 
608 #if LOG_POLY_DEGREE == 6
609  mantissa = POLY5_FMAAVX2(frac,
610  3.1157899f,
611  -3.3241990f,
612  2.5988452f,
613  -1.2315303f,
614  3.1821337e-1f,
615  -3.4436006e-2f);
616 #elif LOG_POLY_DEGREE == 5
617  mantissa = POLY4_FMAAVX2(frac,
618  2.8882704548164776201f,
619  -2.52074962577807006663f,
620  1.48116647521213171641f,
621  -0.465725644288844778798f,
622  0.0596515482674574969533f);
623 #elif LOG_POLY_DEGREE == 4
624  mantissa = POLY3_FMAAVX2(frac,
625  2.61761038894603480148f,
626  -1.75647175389045657003f,
627  0.688243882994381274313f,
628  -0.107254423828329604454f);
629 #elif LOG_POLY_DEGREE == 3
630  mantissa = POLY2_FMAAVX2(frac,
631  2.28330284476918490682f,
632  -1.04913055217340124191f,
633  0.204446009836232697516f);
634 #else
635 #error
636 #endif
637 
638  bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
639  _mm256_storeu_ps(bPtr, bVal);
640 
641  aPtr += 8;
642  bPtr += 8;
643  }
644 
645  number = eighthPoints * 8;
646  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
647 }
648 
649 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
650 
651 #ifdef LV_HAVE_AVX2
652 #include <immintrin.h>
653 
654 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
655 #define POLY1_AVX2(x, c0, c1) \
656  _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
657 #define POLY2_AVX2(x, c0, c1, c2) \
658  _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
659 #define POLY3_AVX2(x, c0, c1, c2, c3) \
660  _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
661 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
662  _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
663 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
664  _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
665 
666 static inline void
667 volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
668 {
669  float* bPtr = bVector;
670  const float* aPtr = aVector;
671 
672  unsigned int number = 0;
673  const unsigned int eighthPoints = num_points / 8;
674 
675  __m256 aVal, bVal, mantissa, frac, leadingOne;
676  __m256i bias, exp;
677 
678  for (; number < eighthPoints; number++) {
679 
680  aVal = _mm256_loadu_ps(aPtr);
681  bias = _mm256_set1_epi32(127);
682  leadingOne = _mm256_set1_ps(1.0f);
683  exp = _mm256_sub_epi32(
684  _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
685  _mm256_set1_epi32(0x7f800000)),
686  23),
687  bias);
688  bVal = _mm256_cvtepi32_ps(exp);
689 
690  // Now to extract mantissa
691  frac = _mm256_or_ps(
692  leadingOne,
693  _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
694 
695 #if LOG_POLY_DEGREE == 6
696  mantissa = POLY5_AVX2(frac,
697  3.1157899f,
698  -3.3241990f,
699  2.5988452f,
700  -1.2315303f,
701  3.1821337e-1f,
702  -3.4436006e-2f);
703 #elif LOG_POLY_DEGREE == 5
704  mantissa = POLY4_AVX2(frac,
705  2.8882704548164776201f,
706  -2.52074962577807006663f,
707  1.48116647521213171641f,
708  -0.465725644288844778798f,
709  0.0596515482674574969533f);
710 #elif LOG_POLY_DEGREE == 4
711  mantissa = POLY3_AVX2(frac,
712  2.61761038894603480148f,
713  -1.75647175389045657003f,
714  0.688243882994381274313f,
715  -0.107254423828329604454f);
716 #elif LOG_POLY_DEGREE == 3
717  mantissa = POLY2_AVX2(frac,
718  2.28330284476918490682f,
719  -1.04913055217340124191f,
720  0.204446009836232697516f);
721 #else
722 #error
723 #endif
724 
725  bVal =
726  _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
727  _mm256_storeu_ps(bPtr, bVal);
728 
729  aPtr += 8;
730  bPtr += 8;
731  }
732 
733  number = eighthPoints * 8;
734  volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
735 }
736 
737 #endif /* LV_HAVE_AVX2 for unaligned */
738 
739 
740 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6087
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
Definition: sse2neon.h:3250
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128i _mm_castps_si128(__m128)
Definition: sse2neon.h:3230
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_log2_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:462
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:415
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:381
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:366
static float log2f_non_ieee(float f)
Definition: volk_common.h:159