Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
53 #define INCLUDED_volk_32f_index_max_32u_a_H
54 
55 #include <inttypes.h>
56 #include <stdio.h>
57 #include <volk/volk_common.h>
58 
59 #ifdef LV_HAVE_SSE4_1
60 #include <smmintrin.h>
61 
62 static inline void
63 volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
64 {
65  if (num_points > 0) {
66  uint32_t number = 0;
67  const uint32_t quarterPoints = num_points / 4;
68 
69  float* inputPtr = (float*)src0;
70 
71  __m128 indexIncrementValues = _mm_set1_ps(4);
72  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
73 
74  float max = src0[0];
75  float index = 0;
76  __m128 maxValues = _mm_set1_ps(max);
77  __m128 maxValuesIndex = _mm_setzero_ps();
78  __m128 compareResults;
79  __m128 currentValues;
80 
81  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
82  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
83 
84  for (; number < quarterPoints; number++) {
85 
86  currentValues = _mm_load_ps(inputPtr);
87  inputPtr += 4;
88  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
89 
90  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
91 
92  maxValuesIndex =
93  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
94  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
95  }
96 
97  // Calculate the largest value from the remaining 4 points
98  _mm_store_ps(maxValuesBuffer, maxValues);
99  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
100 
101  for (number = 0; number < 4; number++) {
102  if (maxValuesBuffer[number] > max) {
103  index = maxIndexesBuffer[number];
104  max = maxValuesBuffer[number];
105  } else if (maxValuesBuffer[number] == max) {
106  if (index > maxIndexesBuffer[number])
107  index = maxIndexesBuffer[number];
108  }
109  }
110 
111  number = quarterPoints * 4;
112  for (; number < num_points; number++) {
113  if (src0[number] > max) {
114  index = number;
115  max = src0[number];
116  }
117  }
118  target[0] = (uint32_t)index;
119  }
120 }
121 
122 #endif /*LV_HAVE_SSE4_1*/
123 
124 
125 #ifdef LV_HAVE_SSE
126 
127 #include <xmmintrin.h>
128 
129 static inline void
130 volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
131 {
132  if (num_points > 0) {
133  uint32_t number = 0;
134  const uint32_t quarterPoints = num_points / 4;
135 
136  float* inputPtr = (float*)src0;
137 
138  __m128 indexIncrementValues = _mm_set1_ps(4);
139  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
140 
141  float max = src0[0];
142  float index = 0;
143  __m128 maxValues = _mm_set1_ps(max);
144  __m128 maxValuesIndex = _mm_setzero_ps();
145  __m128 compareResults;
146  __m128 currentValues;
147 
148  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
149  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
150 
151  for (; number < quarterPoints; number++) {
152 
153  currentValues = _mm_load_ps(inputPtr);
154  inputPtr += 4;
155  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
156 
157  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
158 
159  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
160  _mm_andnot_ps(compareResults, maxValuesIndex));
161 
162  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
163  _mm_andnot_ps(compareResults, maxValues));
164  }
165 
166  // Calculate the largest value from the remaining 4 points
167  _mm_store_ps(maxValuesBuffer, maxValues);
168  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
169 
170  for (number = 0; number < 4; number++) {
171  if (maxValuesBuffer[number] > max) {
172  index = maxIndexesBuffer[number];
173  max = maxValuesBuffer[number];
174  } else if (maxValuesBuffer[number] == max) {
175  if (index > maxIndexesBuffer[number])
176  index = maxIndexesBuffer[number];
177  }
178  }
179 
180  number = quarterPoints * 4;
181  for (; number < num_points; number++) {
182  if (src0[number] > max) {
183  index = number;
184  max = src0[number];
185  }
186  }
187  target[0] = (uint32_t)index;
188  }
189 }
190 
191 #endif /*LV_HAVE_SSE*/
192 
193 
194 #ifdef LV_HAVE_AVX
195 #include <immintrin.h>
196 
197 static inline void
198 volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
199 {
200  if (num_points > 0) {
201  uint32_t number = 0;
202  const uint32_t quarterPoints = num_points / 8;
203 
204  float* inputPtr = (float*)src0;
205 
206  __m256 indexIncrementValues = _mm256_set1_ps(8);
207  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
208 
209  float max = src0[0];
210  float index = 0;
211  __m256 maxValues = _mm256_set1_ps(max);
212  __m256 maxValuesIndex = _mm256_setzero_ps();
213  __m256 compareResults;
214  __m256 currentValues;
215 
216  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
217  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
218 
219  for (; number < quarterPoints; number++) {
220  currentValues = _mm256_load_ps(inputPtr);
221  inputPtr += 8;
222  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
223  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
224  maxValuesIndex =
225  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
226  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
227  }
228 
229  // Calculate the largest value from the remaining 8 points
230  _mm256_store_ps(maxValuesBuffer, maxValues);
231  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
232 
233  for (number = 0; number < 8; number++) {
234  if (maxValuesBuffer[number] > max) {
235  index = maxIndexesBuffer[number];
236  max = maxValuesBuffer[number];
237  } else if (maxValuesBuffer[number] == max) {
238  if (index > maxIndexesBuffer[number])
239  index = maxIndexesBuffer[number];
240  }
241  }
242 
243  number = quarterPoints * 8;
244  for (; number < num_points; number++) {
245  if (src0[number] > max) {
246  index = number;
247  max = src0[number];
248  }
249  }
250  target[0] = (uint32_t)index;
251  }
252 }
253 
254 #endif /*LV_HAVE_AVX*/
255 
256 
257 #ifdef LV_HAVE_NEON
258 #include <arm_neon.h>
259 
/*!
 * \brief Index of the largest float in a buffer (NEON).
 *
 * Stores in target[0] the position of the first occurrence of the maximum of
 * src0[0 .. num_points-1]. Nothing is written when num_points is 0.
 *
 * \param target     Output; only target[0] is written.
 * \param src0       Input samples.
 * \param num_points Number of floats in src0.
 */
static inline void
volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        uint32_t number = 0;
        const uint32_t quarterPoints = num_points / 4;

        const float* inputPtr = src0;

        /* Positions are tracked as unsigned integers so that they stay exact
         * for every index a uint32_t can hold. */
        const uint32_t firstIndexes[4] = { 0, 1, 2, 3 };
        const uint32x4_t indexIncrementValues = vdupq_n_u32(4);
        uint32x4_t currentIndexes = vld1q_u32(firstIndexes);

        float max = src0[0];
        uint32_t index = 0;
        float32x4_t maxValues = vdupq_n_f32(max);
        uint32x4_t maxValuesIndex = vdupq_n_u32(0);

        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
        __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

        for (; number < quarterPoints; number++) {
            const float32x4_t currentValues = vld1q_f32(inputPtr);
            /* Strictly greater: the first occurrence in a lane is kept, and a
             * NaN sample compares false so it never replaces the lane maximum. */
            const uint32x4_t compareResults = vcgtq_f32(currentValues, maxValues);

            inputPtr += 4;

            /* Select the new index/value where the mask is set, keep the old
             * one elsewhere. */
            maxValuesIndex = vbslq_u32(compareResults, currentIndexes, maxValuesIndex);
            maxValues = vbslq_f32(compareResults, currentValues, maxValues);

            currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
        }

        /* Fold the four per-lane maxima into the running scalar result. */
        vst1q_f32(maxValuesBuffer, maxValues);
        vst1q_u32(maxIndexesBuffer, maxValuesIndex);
        for (number = 0; number < 4; number++) {
            if (maxValuesBuffer[number] > max) {
                index = maxIndexesBuffer[number];
                max = maxValuesBuffer[number];
            } else if (maxValuesBuffer[number] == max) {
                /* Ties resolve to the lowest position. */
                if (index > maxIndexesBuffer[number])
                    index = maxIndexesBuffer[number];
            }
        }

        /* Scalar pass over the points that do not fill a whole vector. */
        number = quarterPoints * 4;
        for (; number < num_points; number++) {
            if (src0[number] > max) {
                index = number;
                max = src0[number];
            }
        }
        target[0] = index;
    }
}
318 
319 #endif /*LV_HAVE_NEON*/
320 
321 
322 #ifdef LV_HAVE_GENERIC
323 
/*!
 * \brief Portable reference: index of the largest float in a buffer.
 *
 * Stores in target[0] the position of the first occurrence of the maximum of
 * src0[0 .. num_points-1]. Nothing is written when num_points is 0.
 *
 * \param target     Output; only target[0] is written.
 * \param src0       Input samples.
 * \param num_points Number of floats in src0.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    uint32_t bestPosition = 0;
    uint32_t position;
    float bestValue;

    if (num_points == 0) {
        return;
    }

    bestValue = src0[0];
    for (position = 1; position < num_points; position++) {
        const float candidate = src0[position];
        /* Strict comparison keeps the earliest of several equal maxima. */
        if (candidate > bestValue) {
            bestValue = candidate;
            bestPosition = position;
        }
    }
    *target = bestPosition;
}
342 
343 #endif /*LV_HAVE_GENERIC*/
344 
345 
346 #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
347 
348 
349 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
350 #define INCLUDED_volk_32f_index_max_32u_u_H
351 
352 #include <inttypes.h>
353 #include <stdio.h>
354 #include <volk/volk_common.h>
355 
356 
357 #ifdef LV_HAVE_AVX
358 #include <immintrin.h>
359 
360 static inline void
361 volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
362 {
363  if (num_points > 0) {
364  uint32_t number = 0;
365  const uint32_t quarterPoints = num_points / 8;
366 
367  float* inputPtr = (float*)src0;
368 
369  __m256 indexIncrementValues = _mm256_set1_ps(8);
370  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
371 
372  float max = src0[0];
373  float index = 0;
374  __m256 maxValues = _mm256_set1_ps(max);
375  __m256 maxValuesIndex = _mm256_setzero_ps();
376  __m256 compareResults;
377  __m256 currentValues;
378 
379  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
380  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
381 
382  for (; number < quarterPoints; number++) {
383  currentValues = _mm256_loadu_ps(inputPtr);
384  inputPtr += 8;
385  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
386  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
387  maxValuesIndex =
388  _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
389  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
390  }
391 
392  // Calculate the largest value from the remaining 8 points
393  _mm256_store_ps(maxValuesBuffer, maxValues);
394  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
395 
396  for (number = 0; number < 8; number++) {
397  if (maxValuesBuffer[number] > max) {
398  index = maxIndexesBuffer[number];
399  max = maxValuesBuffer[number];
400  } else if (maxValuesBuffer[number] == max) {
401  if (index > maxIndexesBuffer[number])
402  index = maxIndexesBuffer[number];
403  }
404  }
405 
406  number = quarterPoints * 8;
407  for (; number < num_points; number++) {
408  if (src0[number] > max) {
409  index = number;
410  max = src0[number];
411  }
412  }
413  target[0] = (uint32_t)index;
414  }
415 }
416 
417 #endif /*LV_HAVE_AVX*/
418 
419 
420 #ifdef LV_HAVE_SSE4_1
421 #include <smmintrin.h>
422 
423 static inline void
424 volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
425 {
426  if (num_points > 0) {
427  uint32_t number = 0;
428  const uint32_t quarterPoints = num_points / 4;
429 
430  float* inputPtr = (float*)src0;
431 
432  __m128 indexIncrementValues = _mm_set1_ps(4);
433  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
434 
435  float max = src0[0];
436  float index = 0;
437  __m128 maxValues = _mm_set1_ps(max);
438  __m128 maxValuesIndex = _mm_setzero_ps();
439  __m128 compareResults;
440  __m128 currentValues;
441 
442  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
443  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
444 
445  for (; number < quarterPoints; number++) {
446  currentValues = _mm_loadu_ps(inputPtr);
447  inputPtr += 4;
448  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
449  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
450  maxValuesIndex =
451  _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
452  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
453  }
454 
455  // Calculate the largest value from the remaining 4 points
456  _mm_store_ps(maxValuesBuffer, maxValues);
457  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
458 
459  for (number = 0; number < 4; number++) {
460  if (maxValuesBuffer[number] > max) {
461  index = maxIndexesBuffer[number];
462  max = maxValuesBuffer[number];
463  } else if (maxValuesBuffer[number] == max) {
464  if (index > maxIndexesBuffer[number])
465  index = maxIndexesBuffer[number];
466  }
467  }
468 
469  number = quarterPoints * 4;
470  for (; number < num_points; number++) {
471  if (src0[number] > max) {
472  index = number;
473  max = src0[number];
474  }
475  }
476  target[0] = (uint32_t)index;
477  }
478 }
479 
480 #endif /*LV_HAVE_SSE4_1*/
481 
482 #ifdef LV_HAVE_SSE
483 #include <xmmintrin.h>
484 
485 static inline void
486 volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
487 {
488  if (num_points > 0) {
489  uint32_t number = 0;
490  const uint32_t quarterPoints = num_points / 4;
491 
492  float* inputPtr = (float*)src0;
493 
494  __m128 indexIncrementValues = _mm_set1_ps(4);
495  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
496 
497  float max = src0[0];
498  float index = 0;
499  __m128 maxValues = _mm_set1_ps(max);
500  __m128 maxValuesIndex = _mm_setzero_ps();
501  __m128 compareResults;
502  __m128 currentValues;
503 
504  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
505  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
506 
507  for (; number < quarterPoints; number++) {
508  currentValues = _mm_loadu_ps(inputPtr);
509  inputPtr += 4;
510  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
511  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
512  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
513  _mm_andnot_ps(compareResults, maxValuesIndex));
514  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
515  _mm_andnot_ps(compareResults, maxValues));
516  }
517 
518  // Calculate the largest value from the remaining 4 points
519  _mm_store_ps(maxValuesBuffer, maxValues);
520  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
521 
522  for (number = 0; number < 4; number++) {
523  if (maxValuesBuffer[number] > max) {
524  index = maxIndexesBuffer[number];
525  max = maxValuesBuffer[number];
526  } else if (maxValuesBuffer[number] == max) {
527  if (index > maxIndexesBuffer[number])
528  index = maxIndexesBuffer[number];
529  }
530  }
531 
532  number = quarterPoints * 4;
533  for (; number < num_points; number++) {
534  if (src0[number] > max) {
535  index = number;
536  max = src0[number];
537  }
538  }
539  target[0] = (uint32_t)index;
540  }
541 }
542 
543 #endif /*LV_HAVE_SSE*/
544 
545 #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:261
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:130
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:486
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:325
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:198
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:361
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
for i
Definition: volk_config_fixed.tmpl.h:13