#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>
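/*
 * Each proto-kernel below computes
 *
 *   *target = sum_i( c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4 ) + num_points * c4
 *
 * where x_i = MAX(src0[i], *cutoff) and c0..c4 are center_point_array[0..4];
 * the implementations differ only in how they vectorize this sum.
 */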
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

    // two independent accumulators, plus the splatted coefficients and cutoff
    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);

    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;
    for (; i < bound; ++i) {
        // first group of 4 points: form x, x^2, x^3, x^4 and scale by c0..c3
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);

        // second group of 4 points, accumulated into xmm1 so the two
        // accumulators break the add dependency chain
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }

    // horizontally reduce both accumulators down to a single float
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);
    _mm_store_ss(&result, xmm4);
    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    result += (float)(num_points)*center_point_array[4];
    *target = result;
}

#endif /*LV_HAVE_SSE3*/
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // c1*x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // c3*x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // c0*x + c1*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // c2*x^3 + c3*x^4

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
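/* Note: _mm256_fmadd_ps rounds once instead of twice per multiply-add, so the
 * FMA proto-kernel above can differ from the non-FMA variants in the last few
 * ulps of the result. */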
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // c0*x
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // c1*x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // c2*x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // c3*x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX
#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    unsigned int i = 0;
    unsigned int k = 0;
    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = MAX(fst, *cutoff);
            sq = fst * fst;
            thrd = fst * sq;
            frth = sq * sq;

            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }

    // pairwise-reduce the eight partial sums, then sum the survivors
    for (k = 0; k < 8; k += 2)
        result[k] = result[k] + result[k + 1];

    *target = result[0] + result[2] + result[4] + result[6];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_GENERIC
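/* Usage sketch (illustrative only; the buffer contents, coefficients, and
 * cutoff below are made up, and every proto-kernel in this file shares the
 * same call shape):
 *
 *   float x[6] = { 0.5f, 1.0f, 1.5f, 2.0f, -3.0f, 0.25f };
 *   float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 2.0f }; // c0..c4
 *   float cutoff = 0.0f; // clamps the -3.0f up to 0.0f before evaluation
 *   float result;
 *   volk_32f_x3_sum_of_poly_32f_generic(&result, x, coeffs, &cutoff, 6);
 *   // result = sum_i(c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4) + 6 * c4
 */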
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    c_qvector = vld1q_f32(zero);
    // load the cutoff into a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... and the polynomial coefficients c0..c3
    cpa_qvector = vld1q_f32(center_point_array);
    for (i = 0; i < num_points; ++i) {
        // splat the current point and clamp it to the cutoff
        x_to_1 = vdup_n_f32(*src0++);
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4

        // zip the pairs and combine into [x | x^2 | x^3 | x^4]
        x_low = vzip_f32(x_to_1, x_to_2);
        x_high = vzip_f32(x_to_3, x_to_4);
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);

        // multiply by the coefficients and accumulate
        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // dump the accumulated vector and sum its four lanes
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];
    *target = accumulator + (float)num_points * center_point_array[4];
}
static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    // four accumulators to break the dependency chain on the adds
    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff and the polynomial coefficients into vectors
    cutoff_vector = vdupq_n_f32(*cutoff);
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);
    for (i = 0; i < num_points / 4; ++i) {
        // load 4 floats and clamp them to the cutoff
        x_to_1 = vld1q_f32(src0);
        src0 += 4;
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector);
        x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
    }
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;
    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target = accumulator + (float)num_points * center_point_array[4];
}
#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
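/* In application code the volk_32f_x3_sum_of_poly_32f dispatcher normally
 * selects among these proto-kernels at runtime; the _a_ versions additionally
 * require VOLK-aligned buffers. Illustrative sketch with made-up sizes and
 * values:
 *
 *   unsigned int n = 1024;
 *   float* x = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 *   float coeffs[5] = { 1.0f, 0.5f, 0.25f, 0.125f, 2.0f };
 *   float cutoff = -2.0f;
 *   float result;
 *   // ... fill x ...
 *   volk_32f_x3_sum_of_poly_32f(&result, x, coeffs, &cutoff, n);
 *   volk_free(x);
 */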
#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>
static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                         float* src0,
                                                         float* center_point_array,
                                                         float* cutoff,
                                                         unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // c1*x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // c3*x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // c0*x + c1*x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // c2*x^3 + c3*x^4

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
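/* The _u_ (unaligned) proto-kernels differ from their _a_ counterparts only in
 * using _mm256_loadu_ps/_mm256_storeu_ps, so they accept buffers without VOLK
 * alignment at some cost in throughput. */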
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;
    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // c0*x
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // c1*x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // c2*x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // c3*x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);

        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points)*center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/