Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x3_sum_of_poly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif
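
/*
 * All implementations below compute the same quantity. A sketch of the math,
 * inferred from the code: with x_i = max(src0[i], *cutoff) and coefficients
 * c0..c4 taken from center_point_array,
 *
 *   *target = sum_i (c0*x_i + c1*x_i^2 + c2*x_i^3 + c3*x_i^4) + num_points * c4
 *
 * i.e. a fourth-order polynomial without its constant term is evaluated and
 * summed over the clipped input, and the constant term is added once per point.
 */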

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
                                                      float* src0,
                                                      float* center_point_array,
                                                      float* cutoff,
                                                      unsigned int num_points)
{
    float result = 0.0f;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;

    xmm9 = _mm_setzero_ps();
    xmm1 = _mm_setzero_ps();
    xmm0 = _mm_load1_ps(&center_point_array[0]);
    xmm6 = _mm_load1_ps(&center_point_array[1]);
    xmm7 = _mm_load1_ps(&center_point_array[2]);
    xmm8 = _mm_load1_ps(&center_point_array[3]);
    xmm10 = _mm_load1_ps(cutoff);

    int bound = num_points / 8;
    int leftovers = num_points - 8 * bound;
    int i = 0;
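
    // Note: the loop below is unrolled by 8 (two 4-wide blocks per
    // iteration) and keeps two independent accumulators, xmm9 and xmm1,
    // so the two halves do not serialize on a single add-dependency chain.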
    for (; i < bound; ++i) {
        // 1st
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm9 = _mm_add_ps(xmm2, xmm9);
        xmm9 = _mm_add_ps(xmm3, xmm9);

        // 2nd
        xmm2 = _mm_load_ps(src0);
        xmm2 = _mm_max_ps(xmm10, xmm2);
        xmm3 = _mm_mul_ps(xmm2, xmm2);
        xmm4 = _mm_mul_ps(xmm2, xmm3);
        xmm5 = _mm_mul_ps(xmm3, xmm3);

        xmm2 = _mm_mul_ps(xmm2, xmm0);
        xmm3 = _mm_mul_ps(xmm3, xmm6);
        xmm4 = _mm_mul_ps(xmm4, xmm7);
        xmm5 = _mm_mul_ps(xmm5, xmm8);

        xmm2 = _mm_add_ps(xmm2, xmm3);
        xmm3 = _mm_add_ps(xmm4, xmm5);

        src0 += 4;

        xmm1 = _mm_add_ps(xmm2, xmm1);
        xmm1 = _mm_add_ps(xmm3, xmm1);
    }
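    // Reduce: _mm_hadd_ps(xmm9, xmm1) merges pairwise sums of both
    // accumulators into one register; two more hadds leave the full sum
    // in every lane, from which one lane is stored.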
    xmm2 = _mm_hadd_ps(xmm9, xmm1);
    xmm3 = _mm_hadd_ps(xmm2, xmm2);
    xmm4 = _mm_hadd_ps(xmm3, xmm3);
    _mm_store_ss(&result, xmm4);

    for (i = 0; i < leftovers; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        result += (center_point_array[0] * fst + center_point_array[1] * sq +
                   center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    result += (float)(num_points) * center_point_array[4];
    *target = result;
}


#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
                                                          float* src0,
                                                          float* center_point_array,
                                                          float* cutoff,
                                                          unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0] * x^1 + cpa[1] * x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2] * x^3 + cpa[3] * x^4
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd-based vector reduction has only a very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA
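
/* Note (a general property of fused multiply-add, not specific to this file):
 * _mm256_fmadd_ps rounds once per fused operation, so the FMA kernels above
 * and below can differ from the separate mul+add kernels in the last bits of
 * the result. */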

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_load_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd-based vector reduction has only a very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_store_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
                                                       float* src0,
                                                       float* center_point_array,
                                                       float* cutoff,
                                                       unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;

    float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    unsigned int i = 0;
    unsigned int k = 0;
    for (i = 0; i < eighth_points; ++i) {
        for (k = 0; k < 8; ++k) {
            fst = *src0++;
            fst = MAX(fst, *cutoff);
            sq = fst * fst;
            thrd = fst * sq;
            frth = fst * thrd;
            result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
            result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
        }
    }
    for (k = 0; k < 8; k += 2)
        result[k] = result[k] + result[k + 1];

    *target = result[0] + result[2] + result[4] + result[6];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = fst * thrd;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }
    *target += (float)(num_points) * center_point_array[4];
}

#endif /*LV_HAVE_GENERIC*/
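
/*
 * A minimal usage sketch (illustrative values, not part of the original
 * header): calling the generic kernel directly.
 *
 * \code
 * float src[10] = { -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
 * float coeffs[5] = { 1.f, 0.5f, 0.25f, 0.125f, 2.f }; // c0..c4
 * float cutoff = 0.0f; // inputs below 0 are clipped to 0 before evaluation
 * float result;
 * volk_32f_x3_sum_of_poly_32f_generic(&result, src, coeffs, &cutoff, 10);
 * // result now holds sum_i(c0*x + c1*x^2 + c2*x^3 + c3*x^4) + 10 * c4
 * \endcode
 */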

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
                                   float* __restrict src0,
                                   float* __restrict center_point_array,
                                   float* __restrict cutoff,
                                   unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x2_t cutoff_vector;
    float32x2x2_t x_low, x_high;
    float32x4_t x_qvector, c_qvector, cpa_qvector;
    float accumulator;
    float res_accumulators[4];

    c_qvector = vld1q_f32(zero);
    // load the cutoff in to a vector
    cutoff_vector = vdup_n_f32(*cutoff);
    // ... center point array
    cpa_qvector = vld1q_f32(center_point_array);

    for (i = 0; i < num_points; ++i) {
        // load x (src0)
        x_to_1 = vdup_n_f32(*src0++);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmul_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmul_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmul_f32(x_to_3, x_to_1);        // x^4
        // zip up doubles to interleave
        x_low = vzip_f32(x_to_1, x_to_2);  // [x^2 | x^1 || x^2 | x^1]
        x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
        // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
        x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
        // now we finally have [x^4 | x^3 | x^2 | x] !

        c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
    }
    // there should be better vector reduction techniques
    vst1q_f32(res_accumulators, c_qvector);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON

static inline void
volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
                                     float* __restrict src0,
                                     float* __restrict center_point_array,
                                     float* __restrict cutoff,
                                     unsigned int num_points)
{
    unsigned int i;
    float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };

    float accumulator;

    float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
    accumulator1_vec = vld1q_f32(zero);
    accumulator2_vec = vld1q_f32(zero);
    accumulator3_vec = vld1q_f32(zero);
    accumulator4_vec = vld1q_f32(zero);
    float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
    float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;

    // load the cutoff in to a vector
    cutoff_vector = vdupq_n_f32(*cutoff);
    // ... center point array
    cpa_0 = vdupq_n_f32(center_point_array[0]);
    cpa_1 = vdupq_n_f32(center_point_array[1]);
    cpa_2 = vdupq_n_f32(center_point_array[2]);
    cpa_3 = vdupq_n_f32(center_point_array[3]);

    // nathan is not sure why this is slower *and* wrong compared to neonvertfma
    for (i = 0; i < num_points / 4; ++i) {
        // load x
        x_to_1 = vld1q_f32(src0);

        // Get a vector of max(src0, cutoff)
        x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
        x_to_2 = vmulq_f32(x_to_1, x_to_1);        // x^2
        x_to_3 = vmulq_f32(x_to_2, x_to_1);        // x^3
        x_to_4 = vmulq_f32(x_to_3, x_to_1);        // x^4
        x_to_1 = vmulq_f32(x_to_1, cpa_0);
        x_to_2 = vmulq_f32(x_to_2, cpa_1);
        x_to_3 = vmulq_f32(x_to_3, cpa_2);
        x_to_4 = vmulq_f32(x_to_4, cpa_3);
        accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
        accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
        accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
        accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);

        src0 += 4;
    }
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
    accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
    accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);

    __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
    vst1q_f32(res_accumulators, accumulator1_vec);
    accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
                  res_accumulators[3];

    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    for (i = 4 * (num_points / 4); i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);

        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
                        center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target = accumulator + (float)num_points * center_point_array[4];
}

#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/

#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifndef MAX
#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
#endif

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
                                                         float* src0,
                                                         float* center_point_array,
                                                         float* cutoff,
                                                         unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2); // cpa[0] * x^1 + cpa[1] * x^2
        x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4); // cpa[2] * x^3 + cpa[3] * x^4
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd-based vector reduction has only a very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;
        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX && LV_HAVE_FMA

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
                                                     float* src0,
                                                     float* center_point_array,
                                                     float* cutoff,
                                                     unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    float fst = 0.0f;
    float sq = 0.0f;
    float thrd = 0.0f;
    float frth = 0.0f;

    __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
    __m256 target_vec;
    __m256 x_to_1, x_to_2, x_to_3, x_to_4;

    cpa0 = _mm256_set1_ps(center_point_array[0]);
    cpa1 = _mm256_set1_ps(center_point_array[1]);
    cpa2 = _mm256_set1_ps(center_point_array[2]);
    cpa3 = _mm256_set1_ps(center_point_array[3]);
    cutoff_vec = _mm256_set1_ps(*cutoff);
    target_vec = _mm256_setzero_ps();

    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        x_to_1 = _mm256_loadu_ps(src0);
        x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
        x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
        x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
        // x^1 * x^3 is slightly faster than x^2 * x^2
        x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4

        x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
        x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
        x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
        x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4

        x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
        x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
        // this is slightly faster than result += (x_to_1 + x_to_3)
        target_vec = _mm256_add_ps(x_to_1, target_vec);
        target_vec = _mm256_add_ps(x_to_3, target_vec);

        src0 += 8;
    }

    // the hadd-based vector reduction has only a very slight impact @ 50k iters
    __VOLK_ATTR_ALIGNED(32) float temp_results[8];
    target_vec = _mm256_hadd_ps(
        target_vec,
        target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
    _mm256_storeu_ps(temp_results, target_vec);
    *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];

    for (i = eighth_points * 8; i < num_points; ++i) {
        fst = *src0++;
        fst = MAX(fst, *cutoff);
        sq = fst * fst;
        thrd = fst * sq;
        frth = sq * sq;

        *target += (center_point_array[0] * fst + center_point_array[1] * sq +
                    center_point_array[2] * thrd + center_point_array[3] * frth);
    }

    *target += (float)(num_points) * center_point_array[4];
}
#endif // LV_HAVE_AVX

#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
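
/*
 * Usage sketch for the public dispatch entry point (a hedged example; the
 * dispatcher volk_32f_x3_sum_of_poly_32f is declared in the generated
 * <volk/volk.h>, not in this header). volk_malloc returns buffers with the
 * alignment the _a_ kernels require.
 *
 * \code
 * #include <volk/volk.h>
 *
 * unsigned int N = 1024;
 * float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 * float* coeffs = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
 * float cutoff = 0.0f;
 * float out;
 * // ... fill in[0..N-1] and coeffs[0..4] ...
 * volk_32f_x3_sum_of_poly_32f(&out, in, coeffs, &cutoff, N);
 * volk_free(coeffs);
 * volk_free(in);
 * \endcode
 */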