doxygen/volk__32f__s32f__s32f__mod__range__32f_8h_source.html

 /* -*- c++ -*- */

 /*

  * Copyright 2017 Free Software Foundation, Inc.

  *

  * This file is part of VOLK

  *

  * SPDX-License-Identifier: LGPL-3.0-or-later

  */


 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H

 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H


 #ifdef LV_HAVE_GENERIC


 /*
  * Portable reference kernel: wraps every input sample by whole multiples of
  * (upper_bound - lower_bound) so that out-of-range samples are moved back
  * towards the interval between lower_bound and upper_bound.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  *
  * Samples for which neither comparison holds (including NaN, for which both
  * comparisons are false) are copied through unchanged.
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
                                                             const float* inputVector,
                                                             const float lower_bound,
                                                             const float upper_bound,
                                                             unsigned int num_points)
 {
     const float distance = upper_bound - lower_bound;

     for (unsigned int idx = 0; idx < num_points; idx++) {
         const float val = inputVector[idx];
         float wrapped = val;

         if (val < lower_bound) {
             // number of whole periods the sample lies below the range
             const float excess = lower_bound - val;
             const signed int count = (int)(excess / distance);
             // one extra period is always needed to cross the lower bound
             wrapped = val + (count + 1) * distance;
         } else if (val > upper_bound) {
             // number of whole periods the sample lies above the range
             const float excess = val - upper_bound;
             const signed int count = (int)(excess / distance);
             // one extra period is always needed to cross the upper bound
             wrapped = val - (count + 1) * distance;
         }

         outputVector[idx] = wrapped;
     }
 }

 #endif /* LV_HAVE_GENERIC */


 #ifdef LV_HAVE_AVX

 #include <xmmintrin.h>


 /*
  * AVX kernel using unaligned loads/stores: processes eight floats per
  * iteration and hands the remaining (num_points % 8) samples to the generic
  * kernel.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
 {
     const __m256 lo = _mm256_set1_ps(lower_bound);
     const __m256 hi = _mm256_set1_ps(upper_bound);
     const __m256 span = _mm256_sub_ps(hi, lo);
     const __m256 plus_one = _mm256_set1_ps(1.0f);
     const __m256 minus_one = _mm256_set1_ps(-1.0f);

     const size_t num_blocks = num_points / 8;
     const float* src = inputVector;
     float* dst = outputVector;

     for (size_t block = 0; block < num_blocks; block++, src += 8, dst += 8) {
         const __m256 in = _mm256_loadu_ps(src);

         // per-lane masks (ordered, non-signalling compares): in < lo, in > hi
         const __m256 below = _mm256_cmp_ps(in, lo, _CMP_LT_OQ);
         const __m256 above = _mm256_cmp_ps(in, hi, _CMP_GT_OQ);

         // distance outside the range; lanes matching neither mask stay all-zero
         const __m256 under = _mm256_and_ps(_mm256_sub_ps(lo, in), below);
         const __m256 over = _mm256_and_ps(_mm256_sub_ps(in, hi), above);
         const __m256 overshoot = _mm256_or_ps(over, under);

         // whole periods outside the range: truncate via an int32 round trip
         const __m256 ratio = _mm256_div_ps(overshoot, span);
         const __m256 whole = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(ratio));
         const __m256 steps = _mm256_add_ps(whole, plus_one);

         // +1 where the lane was too small, -1 where too big, 0 otherwise
         const __m256 sign = _mm256_or_ps(_mm256_and_ps(minus_one, above),
                                          _mm256_and_ps(plus_one, below));

         // signed correction in units of the range width
         const __m256 shift = _mm256_mul_ps(_mm256_mul_ps(steps, sign), span);
         _mm256_storeu_ps(dst, _mm256_add_ps(in, shift));
     }

     // leftover samples that do not fill a full 8-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         dst, src, lower_bound, upper_bound, num_points - num_blocks * 8);
 }

 /*
  * AVX kernel using aligned loads/stores (_mm256_load_ps / _mm256_store_ps):
  * processes eight floats per iteration and hands the remaining
  * (num_points % 8) samples to the generic kernel.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
 {
     const __m256 lo = _mm256_set1_ps(lower_bound);
     const __m256 hi = _mm256_set1_ps(upper_bound);
     const __m256 span = _mm256_sub_ps(hi, lo);
     const __m256 plus_one = _mm256_set1_ps(1.0f);
     const __m256 minus_one = _mm256_set1_ps(-1.0f);

     const size_t num_blocks = num_points / 8;
     const float* src = inputVector;
     float* dst = outputVector;

     for (size_t block = 0; block < num_blocks; block++, src += 8, dst += 8) {
         const __m256 in = _mm256_load_ps(src);

         // per-lane masks (ordered, non-signalling compares): in < lo, in > hi
         const __m256 below = _mm256_cmp_ps(in, lo, _CMP_LT_OQ);
         const __m256 above = _mm256_cmp_ps(in, hi, _CMP_GT_OQ);

         // distance outside the range; lanes matching neither mask stay all-zero
         const __m256 under = _mm256_and_ps(_mm256_sub_ps(lo, in), below);
         const __m256 over = _mm256_and_ps(_mm256_sub_ps(in, hi), above);
         const __m256 overshoot = _mm256_or_ps(over, under);

         // whole periods outside the range: truncate via an int32 round trip
         const __m256 ratio = _mm256_div_ps(overshoot, span);
         const __m256 whole = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(ratio));
         const __m256 steps = _mm256_add_ps(whole, plus_one);

         // +1 where the lane was too small, -1 where too big, 0 otherwise
         const __m256 sign = _mm256_or_ps(_mm256_and_ps(minus_one, above),
                                          _mm256_and_ps(plus_one, below));

         // signed correction in units of the range width
         const __m256 shift = _mm256_mul_ps(_mm256_mul_ps(steps, sign), span);
         _mm256_store_ps(dst, _mm256_add_ps(in, shift));
     }

     // leftover samples that do not fill a full 8-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         dst, src, lower_bound, upper_bound, num_points - num_blocks * 8);
 }

 #endif /* LV_HAVE_AVX */


 #ifdef LV_HAVE_SSE2

 #include <xmmintrin.h>


 /*
  * SSE2 kernel for buffers without an alignment guarantee: processes four
  * floats per iteration and hands the remaining (num_points % 4) samples to
  * the generic kernel.
  *
  * Uses _mm_loadu_ps / _mm_storeu_ps so that this function itself places no
  * 16-byte alignment requirement on inputVector or outputVector.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
                                                            const float* inputVector,
                                                            const float lower_bound,
                                                            const float upper_bound,
                                                            unsigned int num_points)
 {
     const __m128 lower = _mm_set_ps1(lower_bound);
     const __m128 upper = _mm_set_ps1(upper_bound);
     const __m128 distance = _mm_sub_ps(upper, lower);
     __m128 input, output;
     __m128 is_smaller, is_bigger;
     __m128 excess, adj;

     const float* inPtr = inputVector;
     float* outPtr = outputVector;
     const size_t quarter_points = num_points / 4;
     for (size_t counter = 0; counter < quarter_points; counter++) {
         // unaligned load: an aligned load would fault on a misaligned pointer
         input = _mm_loadu_ps(inPtr);
         // calculate mask: input < lower, input > upper
         is_smaller = _mm_cmplt_ps(input, lower);
         is_bigger = _mm_cmpgt_ps(input, upper);
         // find out how far we are out-of-bound – positive values!
         excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
         excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
         // how many do we have to add? (int(excess/distance+1)*distance)
         excess = _mm_div_ps(excess, distance);
         // round down: truncate via an int32 round trip
         excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
         // plus 1
         adj = _mm_set_ps1(1.0f);
         excess = _mm_add_ps(excess, adj);
         // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}:
         // +1 where too small, -1 where too big, 0 for untouched lanes
         adj = _mm_and_ps(adj, is_smaller);
         adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
         // scale by distance, sign
         excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
         output = _mm_add_ps(input, excess);
         // unaligned store, matching the unaligned load above
         _mm_storeu_ps(outPtr, output);
         inPtr += 4;
         outPtr += 4;
     }

     // leftover samples that do not fill a full 4-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
 }

 /*
  * SSE2 kernel using aligned loads/stores (_mm_load_ps / _mm_store_ps):
  * processes four floats per iteration and hands the remaining
  * (num_points % 4) samples to the generic kernel.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
                                                            const float* inputVector,
                                                            const float lower_bound,
                                                            const float upper_bound,
                                                            unsigned int num_points)
 {
     const __m128 lo = _mm_set_ps1(lower_bound);
     const __m128 hi = _mm_set_ps1(upper_bound);
     const __m128 span = _mm_sub_ps(hi, lo);
     const __m128 plus_one = _mm_set_ps1(1.0f);
     const __m128 minus_one = _mm_set_ps1(-1.0f);

     const size_t num_blocks = num_points / 4;
     const float* src = inputVector;
     float* dst = outputVector;

     for (size_t block = 0; block < num_blocks; block++, src += 4, dst += 4) {
         const __m128 in = _mm_load_ps(src);

         // per-lane masks: in < lo, in > hi
         const __m128 below = _mm_cmplt_ps(in, lo);
         const __m128 above = _mm_cmpgt_ps(in, hi);

         // distance outside the range; lanes matching neither mask stay all-zero
         const __m128 under = _mm_and_ps(_mm_sub_ps(lo, in), below);
         const __m128 over = _mm_and_ps(_mm_sub_ps(in, hi), above);
         const __m128 overshoot = _mm_or_ps(over, under);

         // whole periods outside the range: truncate via an int32 round trip
         const __m128 ratio = _mm_div_ps(overshoot, span);
         const __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(ratio));
         const __m128 steps = _mm_add_ps(whole, plus_one);

         // +1 where the lane was too small, -1 where too big, 0 otherwise
         const __m128 sign =
             _mm_or_ps(_mm_and_ps(minus_one, above), _mm_and_ps(plus_one, below));

         // signed correction in units of the range width
         const __m128 shift = _mm_mul_ps(_mm_mul_ps(steps, sign), span);
         _mm_store_ps(dst, _mm_add_ps(in, shift));
     }

     // leftover samples that do not fill a full 4-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         dst, src, lower_bound, upper_bound, num_points - num_blocks * 4);
 }

 #endif /* LV_HAVE_SSE2 */


 #ifdef LV_HAVE_SSE

 #include <xmmintrin.h>


 /*
  * SSE kernel for buffers without an alignment guarantee: processes four
  * floats per iteration and hands the remaining (num_points % 4) samples to
  * the generic kernel.
  *
  * Uses _mm_loadu_ps / _mm_storeu_ps so that this function itself places no
  * 16-byte alignment requirement on inputVector or outputVector.
  *
  * NOTE(review): _mm_cvttps_epi32 / _mm_cvtepi32_ps and __m128i are SSE2
  * (emmintrin.h) facilities although this kernel sits under LV_HAVE_SSE —
  * confirm that SSE2 is always available where this is compiled.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
 {
     const __m128 lower = _mm_set_ps1(lower_bound);
     const __m128 upper = _mm_set_ps1(upper_bound);
     const __m128 distance = _mm_sub_ps(upper, lower);
     __m128 input, output;
     __m128 is_smaller, is_bigger;
     __m128 excess, adj;
     __m128i rounddown;

     const float* inPtr = inputVector;
     float* outPtr = outputVector;
     const size_t quarter_points = num_points / 4;
     for (size_t counter = 0; counter < quarter_points; counter++) {
         // unaligned load: an aligned load would fault on a misaligned pointer
         input = _mm_loadu_ps(inPtr);
         // calculate mask: input < lower, input > upper
         is_smaller = _mm_cmplt_ps(input, lower);
         is_bigger = _mm_cmpgt_ps(input, upper);
         // find out how far we are out-of-bound – positive values!
         excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
         excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
         // how many do we have to add? (int(excess/distance+1)*distance)
         excess = _mm_div_ps(excess, distance);
         // round down: truncate to int32, then convert back to float
         rounddown = _mm_cvttps_epi32(excess);
         excess = _mm_cvtepi32_ps(rounddown);
         // plus 1
         adj = _mm_set_ps1(1.0f);
         excess = _mm_add_ps(excess, adj);
         // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}:
         // +1 where too small, -1 where too big, 0 for untouched lanes
         adj = _mm_and_ps(adj, is_smaller);
         adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
         // scale by distance, sign
         excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
         output = _mm_add_ps(input, excess);
         // unaligned store, matching the unaligned load above
         _mm_storeu_ps(outPtr, output);
         inPtr += 4;
         outPtr += 4;
     }

     // leftover samples that do not fill a full 4-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
 }

 /*
  * SSE kernel using aligned loads/stores (_mm_load_ps / _mm_store_ps):
  * processes four floats per iteration and hands the remaining
  * (num_points % 4) samples to the generic kernel.
  *
  * NOTE(review): _mm_cvttps_epi32 / _mm_cvtepi32_ps and __m128i are SSE2
  * (emmintrin.h) facilities although this kernel sits under LV_HAVE_SSE —
  * confirm that SSE2 is always available where this is compiled.
  *
  * outputVector  destination buffer, receives num_points floats
  * inputVector   source buffer, num_points floats are read
  * lower_bound   samples strictly below this value are shifted upwards
  * upper_bound   samples strictly above this value are shifted downwards
  * num_points    number of samples to process
  */
 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
                                                           const float* inputVector,
                                                           const float lower_bound,
                                                           const float upper_bound,
                                                           unsigned int num_points)
 {
     const __m128 lo = _mm_set_ps1(lower_bound);
     const __m128 hi = _mm_set_ps1(upper_bound);
     const __m128 span = _mm_sub_ps(hi, lo);
     const __m128 plus_one = _mm_set_ps1(1.0f);
     const __m128 minus_one = _mm_set_ps1(-1.0f);

     const size_t num_blocks = num_points / 4;
     const float* src = inputVector;
     float* dst = outputVector;

     for (size_t block = 0; block < num_blocks; block++, src += 4, dst += 4) {
         const __m128 in = _mm_load_ps(src);

         // per-lane masks: in < lo, in > hi
         const __m128 below = _mm_cmplt_ps(in, lo);
         const __m128 above = _mm_cmpgt_ps(in, hi);

         // distance outside the range; lanes matching neither mask stay all-zero
         const __m128 under = _mm_and_ps(_mm_sub_ps(lo, in), below);
         const __m128 over = _mm_and_ps(_mm_sub_ps(in, hi), above);
         const __m128 overshoot = _mm_or_ps(over, under);

         // whole periods outside the range: truncate to int32, convert back
         const __m128 ratio = _mm_div_ps(overshoot, span);
         const __m128i truncated = _mm_cvttps_epi32(ratio);
         const __m128 steps = _mm_add_ps(_mm_cvtepi32_ps(truncated), plus_one);

         // +1 where the lane was too small, -1 where too big, 0 otherwise
         const __m128 sign =
             _mm_or_ps(_mm_and_ps(minus_one, above), _mm_and_ps(plus_one, below));

         // signed correction in units of the range width
         const __m128 shift = _mm_mul_ps(_mm_mul_ps(steps, sign), span);
         _mm_store_ps(dst, _mm_add_ps(in, shift));
     }

     // leftover samples that do not fill a full 4-wide vector
     volk_32f_s32f_s32f_mod_range_32f_generic(
         dst, src, lower_bound, upper_bound, num_points - num_blocks * 4);
 }

 #endif /* LV_HAVE_SSE */


 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */

volk_arch_defs.val
val
Definition: volk_arch_defs.py:57

_mm_sub_ps
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834

__m128
float32x4_t __m128
Definition: sse2neon.h:235

_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756

_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205

_mm_cvttps_epi32
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324

_mm_set_ps1
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437

_mm_cmpgt_ps
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154

_mm_and_ps
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064

_mm_add_ps
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039

_mm_cmplt_ps
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190

_mm_load_ps
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858

__m128i
int64x2_t __m128i
Definition: sse2neon.h:244

_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704

_mm_cvtepi32_ps
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937

_mm_or_ps
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237

volk_32f_s32f_s32f_mod_range_32f_u_sse
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:266

volk_32f_s32f_s32f_mod_range_32f_a_avx
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:116

volk_32f_s32f_s32f_mod_range_32f_a_sse
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:313

volk_32f_s32f_s32f_mod_range_32f_u_sse2
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:170

volk_32f_s32f_s32f_mod_range_32f_generic
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:37

volk_32f_s32f_s32f_mod_range_32f_u_avx
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:68

volk_32f_s32f_s32f_mod_range_32f_a_sse2
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:215