Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_index_min_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_index_min_32u_a_H
53 #define INCLUDED_volk_32f_index_min_32u_a_H
54 
55 #include <inttypes.h>
56 #include <stdio.h>
57 #include <volk/volk_common.h>
58 
59 #ifdef LV_HAVE_SSE4_1
60 #include <smmintrin.h>
61 
62 static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
63  const float* source,
64  uint32_t num_points)
65 {
66  const uint32_t quarterPoints = num_points / 4;
67 
68  float* inputPtr = (float*)source;
69 
70  __m128 indexIncrementValues = _mm_set1_ps(4);
71  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
72 
73  float min = source[0];
74  float index = 0;
75  __m128 minValues = _mm_set1_ps(min);
76  __m128 minValuesIndex = _mm_setzero_ps();
77  __m128 compareResults;
78  __m128 currentValues;
79 
80  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
81  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
82 
83  for (uint32_t number = 0; number < quarterPoints; number++) {
84 
85  currentValues = _mm_load_ps(inputPtr);
86  inputPtr += 4;
87  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
88 
89  compareResults = _mm_cmplt_ps(currentValues, minValues);
90 
91  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
92  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
93  }
94 
95  // Calculate the smallest value from the remaining 4 points
96  _mm_store_ps(minValuesBuffer, minValues);
97  _mm_store_ps(minIndexesBuffer, minValuesIndex);
98 
99  for (uint32_t number = 0; number < 4; number++) {
100  if (minValuesBuffer[number] < min) {
101  index = minIndexesBuffer[number];
102  min = minValuesBuffer[number];
103  } else if (minValuesBuffer[number] == min) {
104  if (index > minIndexesBuffer[number])
105  index = minIndexesBuffer[number];
106  }
107  }
108 
109  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
110  if (source[number] < min) {
111  index = number;
112  min = source[number];
113  }
114  }
115  target[0] = (uint32_t)index;
116 }
117 
118 #endif /*LV_HAVE_SSE4_1*/
119 
120 
121 #ifdef LV_HAVE_SSE
122 
123 #include <xmmintrin.h>
124 
125 static inline void
126 volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
127 {
128  const uint32_t quarterPoints = num_points / 4;
129 
130  float* inputPtr = (float*)source;
131 
132  __m128 indexIncrementValues = _mm_set1_ps(4);
133  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
134 
135  float min = source[0];
136  float index = 0;
137  __m128 minValues = _mm_set1_ps(min);
138  __m128 minValuesIndex = _mm_setzero_ps();
139  __m128 compareResults;
140  __m128 currentValues;
141 
142  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
143  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
144 
145  for (uint32_t number = 0; number < quarterPoints; number++) {
146 
147  currentValues = _mm_load_ps(inputPtr);
148  inputPtr += 4;
149  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
150 
151  compareResults = _mm_cmplt_ps(currentValues, minValues);
152 
153  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
154  _mm_andnot_ps(compareResults, minValuesIndex));
155 
156  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
157  _mm_andnot_ps(compareResults, minValues));
158  }
159 
160  // Calculate the smallest value from the remaining 4 points
161  _mm_store_ps(minValuesBuffer, minValues);
162  _mm_store_ps(minIndexesBuffer, minValuesIndex);
163 
164  for (uint32_t number = 0; number < 4; number++) {
165  if (minValuesBuffer[number] < min) {
166  index = minIndexesBuffer[number];
167  min = minValuesBuffer[number];
168  } else if (minValuesBuffer[number] == min) {
169  if (index > minIndexesBuffer[number])
170  index = minIndexesBuffer[number];
171  }
172  }
173 
174  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
175  if (source[number] < min) {
176  index = number;
177  min = source[number];
178  }
179  }
180  target[0] = (uint32_t)index;
181 }
182 
183 #endif /*LV_HAVE_SSE*/
184 
185 
186 #ifdef LV_HAVE_AVX
187 #include <immintrin.h>
188 
189 static inline void
190 volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
191 {
192  const uint32_t quarterPoints = num_points / 8;
193 
194  float* inputPtr = (float*)source;
195 
196  __m256 indexIncrementValues = _mm256_set1_ps(8);
197  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
198 
199  float min = source[0];
200  float index = 0;
201  __m256 minValues = _mm256_set1_ps(min);
202  __m256 minValuesIndex = _mm256_setzero_ps();
203  __m256 compareResults;
204  __m256 currentValues;
205 
206  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
207  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
208 
209  for (uint32_t number = 0; number < quarterPoints; number++) {
210  currentValues = _mm256_load_ps(inputPtr);
211  inputPtr += 8;
212  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
213  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
214  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
215  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
216  }
217 
218  // Calculate the smallest value from the remaining 8 points
219  _mm256_store_ps(minValuesBuffer, minValues);
220  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
221 
222  for (uint32_t number = 0; number < 8; number++) {
223  if (minValuesBuffer[number] < min) {
224  index = minIndexesBuffer[number];
225  min = minValuesBuffer[number];
226  } else if (minValuesBuffer[number] == min) {
227  if (index > minIndexesBuffer[number])
228  index = minIndexesBuffer[number];
229  }
230  }
231 
232  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
233  if (source[number] < min) {
234  index = number;
235  min = source[number];
236  }
237  }
238  target[0] = (uint32_t)index;
239 }
240 
241 #endif /*LV_HAVE_AVX*/
242 
243 
244 #ifdef LV_HAVE_NEON
245 #include <arm_neon.h>
246 
/*
 * Find the index of the smallest value in `source` (ARM NEON).
 *
 * target:     target[0] receives the zero-based index of the minimum.
 * source:     input floats, read four at a time with vld1q_f32.
 * num_points: number of floats in `source`.
 *
 * If the minimum value occurs more than once, the lowest index is reported.
 * For num_points == 0 the result is index 0 and `source` is never read.
 *
 * NOTE: the running lane indices are advanced as floats and the per-lane
 * results are converted back to float for the reduction, so they are only
 * exact while they fit a float's 24-bit significand (< 2^24).
 */
static inline void
volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
{
    // Empty input: report index 0 without touching source[0].
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    const uint32_t quarterPoints = num_points / 4;
    const float* inputPtr = source;

    const float32x4_t indexIncrementValues = vdupq_n_f32(4);
    // Lane indices start one step early; the first add in the loop turns
    // them into {0, 1, 2, 3}.
    const float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
    float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);

    float min = source[0];
    float index = 0;
    float32x4_t minValues = vdupq_n_f32(min);
    uint32x4_t minValuesIndex = vmovq_n_u32(0);

    __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];

    for (uint32_t number = 0; number < quarterPoints; number++) {
        const float32x4_t currentValues = vld1q_f32(inputPtr);
        inputPtr += 4;
        currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
        const uint32x4_t currentIndexes_u = vcvtq_u32_f32(currentIndexes);

        // Mask is set where the new value is NOT smaller: those lanes keep
        // their stored index, the remaining lanes take the current index.
        const uint32x4_t compareResults = vcgeq_f32(currentValues, minValues);
        minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
                                   vbicq_u32(currentIndexes_u, compareResults));
        minValues = vminq_f32(currentValues, minValues);
    }

    // Reduce the four per-lane minima to one; ties go to the lowest index.
    vst1q_f32(minValuesBuffer, minValues);
    vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));

    for (uint32_t number = 0; number < 4; number++) {
        if (minValuesBuffer[number] < min) {
            index = minIndexesBuffer[number];
            min = minValuesBuffer[number];
        } else if (minValuesBuffer[number] == min) {
            // Compare against the stored buffer rather than subscripting the
            // vector register, which relies on a compiler extension.
            if (index > minIndexesBuffer[number]) {
                index = minIndexesBuffer[number];
            }
        }
    }

    // Scan the leftover points with an integer index so that a tail hit is
    // not rounded through float.
    uint32_t minIndex = (uint32_t)index;
    for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
        if (source[number] < min) {
            minIndex = number;
            min = source[number];
        }
    }
    target[0] = minIndex;
}
301 
302 #endif /*LV_HAVE_NEON*/
303 
304 
305 #ifdef LV_HAVE_GENERIC
306 
/*
 * Find the index of the smallest value in `source` (portable scalar version).
 *
 * target:     target[0] receives the zero-based index of the minimum.
 * source:     input floats.
 * num_points: number of floats in `source`.
 *
 * If the minimum value occurs more than once, the lowest index is reported
 * because only a strictly smaller value replaces the current minimum.
 * For num_points == 0 the result is index 0 and `source` is never read.
 */
static inline void
volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
    // Empty input: report index 0 without touching source[0].
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    float min = source[0];
    uint32_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            index = i;
            min = source[i];
        }
    }
    target[0] = index;
}
321 
322 #endif /*LV_HAVE_GENERIC*/
323 
324 
325 #endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
326 
327 
328 #ifndef INCLUDED_volk_32f_index_min_32u_u_H
329 #define INCLUDED_volk_32f_index_min_32u_u_H
330 
331 #include <inttypes.h>
332 #include <stdio.h>
333 #include <volk/volk_common.h>
334 
335 
336 #ifdef LV_HAVE_AVX
337 #include <immintrin.h>
338 
339 static inline void
340 volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
341 {
342  const uint32_t quarterPoints = num_points / 8;
343 
344  float* inputPtr = (float*)source;
345 
346  __m256 indexIncrementValues = _mm256_set1_ps(8);
347  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
348 
349  float min = source[0];
350  float index = 0;
351  __m256 minValues = _mm256_set1_ps(min);
352  __m256 minValuesIndex = _mm256_setzero_ps();
353  __m256 compareResults;
354  __m256 currentValues;
355 
356  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
357  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
358 
359  for (uint32_t number = 0; number < quarterPoints; number++) {
360  currentValues = _mm256_loadu_ps(inputPtr);
361  inputPtr += 8;
362  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
363  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
364  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
365  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
366  }
367 
368  // Calculate the smalles value from the remaining 8 points
369  _mm256_store_ps(minValuesBuffer, minValues);
370  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
371 
372  for (uint32_t number = 0; number < 8; number++) {
373  if (minValuesBuffer[number] < min) {
374  index = minIndexesBuffer[number];
375  min = minValuesBuffer[number];
376  } else if (minValuesBuffer[number] == min) {
377  if (index > minIndexesBuffer[number])
378  index = minIndexesBuffer[number];
379  }
380  }
381 
382  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
383  if (source[number] < min) {
384  index = number;
385  min = source[number];
386  }
387  }
388  target[0] = (uint32_t)index;
389 }
390 
391 #endif /*LV_HAVE_AVX*/
392 
393 
394 #ifdef LV_HAVE_SSE4_1
395 #include <smmintrin.h>
396 
397 static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
398  const float* source,
399  uint32_t num_points)
400 {
401  const uint32_t quarterPoints = num_points / 4;
402 
403  float* inputPtr = (float*)source;
404 
405  __m128 indexIncrementValues = _mm_set1_ps(4);
406  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
407 
408  float min = source[0];
409  float index = 0;
410  __m128 minValues = _mm_set1_ps(min);
411  __m128 minValuesIndex = _mm_setzero_ps();
412  __m128 compareResults;
413  __m128 currentValues;
414 
415  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
416  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
417 
418  for (uint32_t number = 0; number < quarterPoints; number++) {
419  currentValues = _mm_loadu_ps(inputPtr);
420  inputPtr += 4;
421  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
422  compareResults = _mm_cmplt_ps(currentValues, minValues);
423  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
424  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
425  }
426 
427  // Calculate the smallest value from the remaining 4 points
428  _mm_store_ps(minValuesBuffer, minValues);
429  _mm_store_ps(minIndexesBuffer, minValuesIndex);
430 
431  for (uint32_t number = 0; number < 4; number++) {
432  if (minValuesBuffer[number] < min) {
433  index = minIndexesBuffer[number];
434  min = minValuesBuffer[number];
435  } else if (minValuesBuffer[number] == min) {
436  if (index > minIndexesBuffer[number])
437  index = minIndexesBuffer[number];
438  }
439  }
440 
441  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
442  if (source[number] < min) {
443  index = number;
444  min = source[number];
445  }
446  }
447  target[0] = (uint32_t)index;
448 }
449 
450 #endif /*LV_HAVE_SSE4_1*/
451 
452 #ifdef LV_HAVE_SSE
453 #include <xmmintrin.h>
454 
455 static inline void
456 volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
457 {
458  const uint32_t quarterPoints = num_points / 4;
459 
460  float* inputPtr = (float*)source;
461 
462  __m128 indexIncrementValues = _mm_set1_ps(4);
463  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
464 
465  float min = source[0];
466  float index = 0;
467  __m128 minValues = _mm_set1_ps(min);
468  __m128 minValuesIndex = _mm_setzero_ps();
469  __m128 compareResults;
470  __m128 currentValues;
471 
472  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
473  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
474 
475  for (uint32_t number = 0; number < quarterPoints; number++) {
476  currentValues = _mm_loadu_ps(inputPtr);
477  inputPtr += 4;
478  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
479  compareResults = _mm_cmplt_ps(currentValues, minValues);
480  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
481  _mm_andnot_ps(compareResults, minValuesIndex));
482  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
483  _mm_andnot_ps(compareResults, minValues));
484  }
485 
486  // Calculate the smallest value from the remaining 4 points
487  _mm_store_ps(minValuesBuffer, minValues);
488  _mm_store_ps(minIndexesBuffer, minValuesIndex);
489 
490  for (uint32_t number = 0; number < 4; number++) {
491  if (minValuesBuffer[number] < min) {
492  index = minIndexesBuffer[number];
493  min = minValuesBuffer[number];
494  } else if (minValuesBuffer[number] == min) {
495  if (index > minIndexesBuffer[number])
496  index = minIndexesBuffer[number];
497  }
498  }
499 
500  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
501  if (source[number] < min) {
502  index = number;
503  min = source[number];
504  }
505  }
506  target[0] = (uint32_t)index;
507 }
508 
509 #endif /*LV_HAVE_SSE*/
510 
511 #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_index_min_32u_neon(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:248
static void volk_32f_index_min_32u_a_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:126
static void volk_32f_index_min_32u_u_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:456
static void volk_32f_index_min_32u_generic(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:308
static void volk_32f_index_min_32u_a_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:190
static void volk_32f_index_min_32u_u_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:340
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
for i
Definition: volk_config_fixed.tmpl.h:13