Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_s32f_mod_range_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2017 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
32 #ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
33 #define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
34 
35 #ifdef LV_HAVE_GENERIC
36 
37 static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
38  const float* inputVector,
39  const float lower_bound,
40  const float upper_bound,
41  unsigned int num_points)
42 {
43  float* outPtr = outputVector;
44  const float* inPtr;
45  const float distance = upper_bound - lower_bound;
46 
47  for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
48  float val = *inPtr;
49  if (val < lower_bound) {
50  float excess = lower_bound - val;
51  signed int count = (int)(excess / distance);
52  *outPtr = val + (count + 1) * distance;
53  } else if (val > upper_bound) {
54  float excess = val - upper_bound;
55  signed int count = (int)(excess / distance);
56  *outPtr = val - (count + 1) * distance;
57  } else
58  *outPtr = val;
59  outPtr++;
60  }
61 }
62 #endif /* LV_HAVE_GENERIC */
63 
64 
65 #ifdef LV_HAVE_AVX
66 #include <xmmintrin.h>
67 
68 static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
69  const float* inputVector,
70  const float lower_bound,
71  const float upper_bound,
72  unsigned int num_points)
73 {
74  const __m256 lower = _mm256_set1_ps(lower_bound);
75  const __m256 upper = _mm256_set1_ps(upper_bound);
76  const __m256 distance = _mm256_sub_ps(upper, lower);
77  __m256 input, output;
78  __m256 is_smaller, is_bigger;
79  __m256 excess, adj;
80 
81  const float* inPtr = inputVector;
82  float* outPtr = outputVector;
83  const size_t eight_points = num_points / 8;
84  for (size_t counter = 0; counter < eight_points; counter++) {
85  input = _mm256_loadu_ps(inPtr);
86  // calculate mask: input < lower, input > upper
87  is_smaller = _mm256_cmp_ps(
88  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
89  is_bigger = _mm256_cmp_ps(
90  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
91  // find out how far we are out-of-bound – positive values!
92  excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
93  excess =
94  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
95  // how many do we have to add? (int(excess/distance+1)*distance)
96  excess = _mm256_div_ps(excess, distance);
97  // round down
98  excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
99  // plus 1
100  adj = _mm256_set1_ps(1.0f);
101  excess = _mm256_add_ps(excess, adj);
102  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
103  adj = _mm256_and_ps(adj, is_smaller);
104  adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
105  // scale by distance, sign
106  excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
107  output = _mm256_add_ps(input, excess);
108  _mm256_storeu_ps(outPtr, output);
109  inPtr += 8;
110  outPtr += 8;
111  }
112 
114  outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
115 }
116 static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
117  const float* inputVector,
118  const float lower_bound,
119  const float upper_bound,
120  unsigned int num_points)
121 {
122  const __m256 lower = _mm256_set1_ps(lower_bound);
123  const __m256 upper = _mm256_set1_ps(upper_bound);
124  const __m256 distance = _mm256_sub_ps(upper, lower);
125  __m256 input, output;
126  __m256 is_smaller, is_bigger;
127  __m256 excess, adj;
128 
129  const float* inPtr = inputVector;
130  float* outPtr = outputVector;
131  const size_t eight_points = num_points / 8;
132  for (size_t counter = 0; counter < eight_points; counter++) {
133  input = _mm256_load_ps(inPtr);
134  // calculate mask: input < lower, input > upper
135  is_smaller = _mm256_cmp_ps(
136  input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
137  is_bigger = _mm256_cmp_ps(
138  input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
139  // find out how far we are out-of-bound – positive values!
140  excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
141  excess =
142  _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
143  // how many do we have to add? (int(excess/distance+1)*distance)
144  excess = _mm256_div_ps(excess, distance);
145  // round down
146  excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
147  // plus 1
148  adj = _mm256_set1_ps(1.0f);
149  excess = _mm256_add_ps(excess, adj);
150  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
151  adj = _mm256_and_ps(adj, is_smaller);
152  adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
153  // scale by distance, sign
154  excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
155  output = _mm256_add_ps(input, excess);
156  _mm256_store_ps(outPtr, output);
157  inPtr += 8;
158  outPtr += 8;
159  }
160 
162  outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
163 }
164 #endif /* LV_HAVE_AVX */
165 
166 
167 #ifdef LV_HAVE_SSE2
168 #include <xmmintrin.h>
169 
170 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
171  const float* inputVector,
172  const float lower_bound,
173  const float upper_bound,
174  unsigned int num_points)
175 {
176  const __m128 lower = _mm_set_ps1(lower_bound);
177  const __m128 upper = _mm_set_ps1(upper_bound);
178  const __m128 distance = _mm_sub_ps(upper, lower);
179  __m128 input, output;
180  __m128 is_smaller, is_bigger;
181  __m128 excess, adj;
182 
183  const float* inPtr = inputVector;
184  float* outPtr = outputVector;
185  const size_t quarter_points = num_points / 4;
186  for (size_t counter = 0; counter < quarter_points; counter++) {
187  input = _mm_load_ps(inPtr);
188  // calculate mask: input < lower, input > upper
189  is_smaller = _mm_cmplt_ps(input, lower);
190  is_bigger = _mm_cmpgt_ps(input, upper);
191  // find out how far we are out-of-bound – positive values!
192  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
193  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
194  // how many do we have to add? (int(excess/distance+1)*distance)
195  excess = _mm_div_ps(excess, distance);
196  // round down
197  excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
198  // plus 1
199  adj = _mm_set_ps1(1.0f);
200  excess = _mm_add_ps(excess, adj);
201  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
202  adj = _mm_and_ps(adj, is_smaller);
203  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
204  // scale by distance, sign
205  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
206  output = _mm_add_ps(input, excess);
207  _mm_store_ps(outPtr, output);
208  inPtr += 4;
209  outPtr += 4;
210  }
211 
213  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
214 }
215 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
216  const float* inputVector,
217  const float lower_bound,
218  const float upper_bound,
219  unsigned int num_points)
220 {
221  const __m128 lower = _mm_set_ps1(lower_bound);
222  const __m128 upper = _mm_set_ps1(upper_bound);
223  const __m128 distance = _mm_sub_ps(upper, lower);
224  __m128 input, output;
225  __m128 is_smaller, is_bigger;
226  __m128 excess, adj;
227 
228  const float* inPtr = inputVector;
229  float* outPtr = outputVector;
230  const size_t quarter_points = num_points / 4;
231  for (size_t counter = 0; counter < quarter_points; counter++) {
232  input = _mm_load_ps(inPtr);
233  // calculate mask: input < lower, input > upper
234  is_smaller = _mm_cmplt_ps(input, lower);
235  is_bigger = _mm_cmpgt_ps(input, upper);
236  // find out how far we are out-of-bound – positive values!
237  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
238  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
239  // how many do we have to add? (int(excess/distance+1)*distance)
240  excess = _mm_div_ps(excess, distance);
241  // round down – for some reason, SSE doesn't come with a 4x float -> 4x int32
242  // conversion.
243  excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
244  // plus 1
245  adj = _mm_set_ps1(1.0f);
246  excess = _mm_add_ps(excess, adj);
247  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
248  adj = _mm_and_ps(adj, is_smaller);
249  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
250  // scale by distance, sign
251  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
252  output = _mm_add_ps(input, excess);
253  _mm_store_ps(outPtr, output);
254  inPtr += 4;
255  outPtr += 4;
256  }
257 
259  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
260 }
261 #endif /* LV_HAVE_SSE2 */
262 
263 #ifdef LV_HAVE_SSE
264 #include <xmmintrin.h>
265 
266 static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
267  const float* inputVector,
268  const float lower_bound,
269  const float upper_bound,
270  unsigned int num_points)
271 {
272  const __m128 lower = _mm_set_ps1(lower_bound);
273  const __m128 upper = _mm_set_ps1(upper_bound);
274  const __m128 distance = _mm_sub_ps(upper, lower);
275  __m128 input, output;
276  __m128 is_smaller, is_bigger;
277  __m128 excess, adj;
278  __m128i rounddown;
279 
280  const float* inPtr = inputVector;
281  float* outPtr = outputVector;
282  const size_t quarter_points = num_points / 4;
283  for (size_t counter = 0; counter < quarter_points; counter++) {
284  input = _mm_load_ps(inPtr);
285  // calculate mask: input < lower, input > upper
286  is_smaller = _mm_cmplt_ps(input, lower);
287  is_bigger = _mm_cmpgt_ps(input, upper);
288  // find out how far we are out-of-bound – positive values!
289  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
290  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
291  // how many do we have to add? (int(excess/distance+1)*distance)
292  excess = _mm_div_ps(excess, distance);
293  // round down – for some reason
294  rounddown = _mm_cvttps_epi32(excess);
295  excess = _mm_cvtepi32_ps(rounddown);
296  // plus 1
297  adj = _mm_set_ps1(1.0f);
298  excess = _mm_add_ps(excess, adj);
299  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
300  adj = _mm_and_ps(adj, is_smaller);
301  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
302  // scale by distance, sign
303  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
304  output = _mm_add_ps(input, excess);
305  _mm_store_ps(outPtr, output);
306  inPtr += 4;
307  outPtr += 4;
308  }
309 
311  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
312 }
313 static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
314  const float* inputVector,
315  const float lower_bound,
316  const float upper_bound,
317  unsigned int num_points)
318 {
319  const __m128 lower = _mm_set_ps1(lower_bound);
320  const __m128 upper = _mm_set_ps1(upper_bound);
321  const __m128 distance = _mm_sub_ps(upper, lower);
322  __m128 input, output;
323  __m128 is_smaller, is_bigger;
324  __m128 excess, adj;
325  __m128i rounddown;
326 
327  const float* inPtr = inputVector;
328  float* outPtr = outputVector;
329  const size_t quarter_points = num_points / 4;
330  for (size_t counter = 0; counter < quarter_points; counter++) {
331  input = _mm_load_ps(inPtr);
332  // calculate mask: input < lower, input > upper
333  is_smaller = _mm_cmplt_ps(input, lower);
334  is_bigger = _mm_cmpgt_ps(input, upper);
335  // find out how far we are out-of-bound – positive values!
336  excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
337  excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
338  // how many do we have to add? (int(excess/distance+1)*distance)
339  excess = _mm_div_ps(excess, distance);
340  // round down
341  rounddown = _mm_cvttps_epi32(excess);
342  excess = _mm_cvtepi32_ps(rounddown);
343  // plus 1
344  adj = _mm_set_ps1(1.0f);
345  excess = _mm_add_ps(excess, adj);
346  // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
347  adj = _mm_and_ps(adj, is_smaller);
348  adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
349  // scale by distance, sign
350  excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
351  output = _mm_add_ps(input, excess);
352  _mm_store_ps(outPtr, output);
353  inPtr += 4;
354  outPtr += 4;
355  }
356 
358  outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
359 }
360 #endif /* LV_HAVE_SSE */
361 
362 
363 #endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
val
Definition: volk_arch_defs.py:57
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
Definition: sse2neon.h:4324
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_s32f_mod_range_32f_u_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:266
static void volk_32f_s32f_s32f_mod_range_32f_a_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:116
static void volk_32f_s32f_s32f_mod_range_32f_a_sse(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:313
static void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:170
static void volk_32f_s32f_s32f_mod_range_32f_generic(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:37
static void volk_32f_s32f_s32f_mod_range_32f_u_avx(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:68
static void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float *outputVector, const float *inputVector, const float lower_bound, const float upper_bound, unsigned int num_points)
Definition: volk_32f_s32f_s32f_mod_range_32f.h:215