32 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
35 #ifdef LV_HAVE_GENERIC
38 const float* inputVector,
39 const float lower_bound,
40 const float upper_bound,
41 unsigned int num_points)
43 float* outPtr = outputVector;
45 const float distance = upper_bound - lower_bound;
47 for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
49 if (
val < lower_bound) {
50 float excess = lower_bound -
val;
51 signed int count = (int)(excess / distance);
52 *outPtr =
val + (count + 1) * distance;
53 }
else if (
val > upper_bound) {
54 float excess =
val - upper_bound;
55 signed int count = (int)(excess / distance);
56 *outPtr =
val - (count + 1) * distance;
66 #include <xmmintrin.h>
69 const float* inputVector,
70 const float lower_bound,
71 const float upper_bound,
72 unsigned int num_points)
74 const __m256 lower = _mm256_set1_ps(lower_bound);
75 const __m256 upper = _mm256_set1_ps(upper_bound);
76 const __m256 distance = _mm256_sub_ps(upper, lower);
78 __m256 is_smaller, is_bigger;
81 const float* inPtr = inputVector;
82 float* outPtr = outputVector;
83 const size_t eight_points = num_points / 8;
84 for (
size_t counter = 0; counter < eight_points; counter++) {
85 input = _mm256_loadu_ps(inPtr);
87 is_smaller = _mm256_cmp_ps(
88 input, lower, _CMP_LT_OQ);
89 is_bigger = _mm256_cmp_ps(
90 input, upper, _CMP_GT_OQ);
92 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
94 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
96 excess = _mm256_div_ps(excess, distance);
98 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
100 adj = _mm256_set1_ps(1.0f);
101 excess = _mm256_add_ps(excess, adj);
103 adj = _mm256_and_ps(adj, is_smaller);
104 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
106 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107 output = _mm256_add_ps(input, excess);
108 _mm256_storeu_ps(outPtr, output);
114 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
117 const float* inputVector,
118 const float lower_bound,
119 const float upper_bound,
120 unsigned int num_points)
122 const __m256 lower = _mm256_set1_ps(lower_bound);
123 const __m256 upper = _mm256_set1_ps(upper_bound);
124 const __m256 distance = _mm256_sub_ps(upper, lower);
125 __m256 input, output;
126 __m256 is_smaller, is_bigger;
129 const float* inPtr = inputVector;
130 float* outPtr = outputVector;
131 const size_t eight_points = num_points / 8;
132 for (
size_t counter = 0; counter < eight_points; counter++) {
133 input = _mm256_load_ps(inPtr);
135 is_smaller = _mm256_cmp_ps(
136 input, lower, _CMP_LT_OQ);
137 is_bigger = _mm256_cmp_ps(
138 input, upper, _CMP_GT_OQ);
140 excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
142 _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
144 excess = _mm256_div_ps(excess, distance);
146 excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
148 adj = _mm256_set1_ps(1.0f);
149 excess = _mm256_add_ps(excess, adj);
151 adj = _mm256_and_ps(adj, is_smaller);
152 adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
154 excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155 output = _mm256_add_ps(input, excess);
156 _mm256_store_ps(outPtr, output);
162 outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
168 #include <xmmintrin.h>
171 const float* inputVector,
172 const float lower_bound,
173 const float upper_bound,
174 unsigned int num_points)
180 __m128 is_smaller, is_bigger;
183 const float* inPtr = inputVector;
184 float* outPtr = outputVector;
185 const size_t quarter_points = num_points / 4;
186 for (
size_t counter = 0; counter < quarter_points; counter++) {
213 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
216 const float* inputVector,
217 const float lower_bound,
218 const float upper_bound,
219 unsigned int num_points)
225 __m128 is_smaller, is_bigger;
228 const float* inPtr = inputVector;
229 float* outPtr = outputVector;
230 const size_t quarter_points = num_points / 4;
231 for (
size_t counter = 0; counter < quarter_points; counter++) {
259 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
264 #include <xmmintrin.h>
267 const float* inputVector,
268 const float lower_bound,
269 const float upper_bound,
270 unsigned int num_points)
276 __m128 is_smaller, is_bigger;
280 const float* inPtr = inputVector;
281 float* outPtr = outputVector;
282 const size_t quarter_points = num_points / 4;
283 for (
size_t counter = 0; counter < quarter_points; counter++) {
311 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
314 const float* inputVector,
315 const float lower_bound,
316 const float upper_bound,
317 unsigned int num_points)
323 __m128 is_smaller, is_bigger;
327 const float* inPtr = inputVector;
328 float* outPtr = outputVector;
329 const size_t quarter_points = num_points / 4;
330 for (
size_t counter = 0; counter < quarter_points; counter++) {
358 outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
val
Definition: volk_arch_defs.py:57
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:266
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:116
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:313
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:170
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:37
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:68
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:215