52 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
53 #define INCLUDED_volk_32f_index_max_32u_a_H
60 #include <smmintrin.h>
63 volk_32f_index_max_32u_a_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
67 const uint32_t quarterPoints = num_points / 4;
69 float* inputPtr = (
float*)src0;
84 for (; number < quarterPoints; number++) {
88 currentIndexes =
_mm_add_ps(currentIndexes, indexIncrementValues);
93 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
94 maxValues =
_mm_blendv_ps(maxValues, currentValues, compareResults);
101 for (number = 0; number < 4; number++) {
102 if (maxValuesBuffer[number] > max) {
103 index = maxIndexesBuffer[number];
104 max = maxValuesBuffer[number];
105 }
else if (maxValuesBuffer[number] == max) {
106 if (index > maxIndexesBuffer[number])
107 index = maxIndexesBuffer[number];
111 number = quarterPoints * 4;
112 for (; number < num_points; number++) {
113 if (src0[number] > max) {
118 target[0] = (uint32_t)index;
127 #include <xmmintrin.h>
132 if (num_points > 0) {
134 const uint32_t quarterPoints = num_points / 4;
136 float* inputPtr = (
float*)src0;
151 for (; number < quarterPoints; number++) {
155 currentIndexes =
_mm_add_ps(currentIndexes, indexIncrementValues);
157 compareResults =
_mm_cmpgt_ps(currentValues, maxValues);
170 for (number = 0; number < 4; number++) {
171 if (maxValuesBuffer[number] > max) {
172 index = maxIndexesBuffer[number];
173 max = maxValuesBuffer[number];
174 }
else if (maxValuesBuffer[number] == max) {
175 if (index > maxIndexesBuffer[number])
176 index = maxIndexesBuffer[number];
180 number = quarterPoints * 4;
181 for (; number < num_points; number++) {
182 if (src0[number] > max) {
187 target[0] = (uint32_t)index;
195 #include <immintrin.h>
/*
 * Aligned AVX fragment: finds the index of the largest float in src0 and
 * writes it to target[0]. Eight lanes each track a running maximum and the
 * index at which it was seen; the lanes are then reduced in scalar code and
 * the leftover elements are scanned one at a time.
 * This listing omits several source lines (declarations of number, max,
 * index and the two spill buffers, plus closing braces), so comments only
 * describe the statements that are visible.
 */
200 if (num_points > 0) {
/* Number of full 8-float groups. NOTE(review): the name says "quarter" but
 * the divisor is 8 -- presumably carried over from the 4-lane SSE code. */
202 const uint32_t quarterPoints = num_points / 8;
/* The cast drops the const qualifier of src0. */
204 float* inputPtr = (
float*)src0;
/* Each iteration advances every lane's index by 8. */
206 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps takes the highest lane first, so lane 0 holds -8 and lane 7
 * holds -1; after the first add of 8 the lanes read 0..7. */
207 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Every lane starts from the scalar max (its initial value is not visible
 * in this listing) with index 0. */
211 __m256 maxValues = _mm256_set1_ps(max);
212 __m256 maxValuesIndex = _mm256_setzero_ps();
213 __m256 compareResults;
214 __m256 currentValues;
219 for (; number < quarterPoints; number++) {
/* Aligned load: inputPtr must be 32-byte aligned here. The pointer advance
 * is not visible in this listing (source line 221 is absent). */
220 currentValues = _mm256_load_ps(inputPtr);
222 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than (ordered, so false when either operand is NaN): a lane
 * is only replaced by a strictly larger value, never by an equal one. */
223 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* NOTE(review): the assignment target of this blend is on a line missing
 * from the listing (source line 224); presumably maxValuesIndex -- confirm. */
225 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
226 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane maxima and their indexes for the scalar reduction.
 * The aligned store needs both buffers to be 32-byte aligned. */
230 _mm256_store_ps(maxValuesBuffer, maxValues);
231 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 8 lanes: take the largest value; on equal values keep the
 * smaller index. */
233 for (number = 0; number < 8; number++) {
234 if (maxValuesBuffer[number] > max) {
235 index = maxIndexesBuffer[number];
236 max = maxValuesBuffer[number];
237 }
else if (maxValuesBuffer[number] == max) {
238 if (index > maxIndexesBuffer[number])
239 index = maxIndexesBuffer[number];
/* Scalar tail over the elements that did not fill a group of 8
 * (the body of this loop is not visible in this listing). */
243 number = quarterPoints * 8;
244 for (; number < num_points; number++) {
245 if (src0[number] > max) {
/* NOTE(review): indexes travel through float lanes (set1_ps/add_ps/store_ps
 * above) before this cast. A float holds integers exactly only up to 2^24,
 * so positions beyond 16777216 may be reported inexactly -- confirm whether
 * callers can pass inputs that long. */
250 target[0] = (uint32_t)index;
258 #include <arm_neon.h>
/*
 * NEON fragment: finds the index of the largest float in src0 and writes it
 * to target[0]. Four lanes each track a running maximum and its index; the
 * lanes are reduced in scalar code and the leftover elements are scanned
 * one at a time.
 * This listing omits several source lines (declarations of number, max,
 * index and the spill buffers, plus closing braces), so comments only
 * describe the statements that are visible.
 */
263 if (num_points > 0) {
/* Number of full 4-float groups. */
265 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0. */
267 float* inputPtr = (
float*)src0;
/* Each iteration advances every lane's index by 4. */
268 float32x4_t indexIncrementValues = vdupq_n_f32(4);
/* Lanes start at -4..-1 so that the first add of 4 yields 0..3. */
270 float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
271 float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
/* Every lane starts from the scalar max (its initial value is not visible
 * in this listing) with index 0. */
275 float32x4_t maxValues = vdupq_n_f32(max);
276 uint32x4_t maxValuesIndex = vmovq_n_u32(0);
277 uint32x4_t compareResults;
278 uint32x4_t currentIndexes_u;
279 float32x4_t currentValues;
284 for (; number < quarterPoints; number++) {
/* The pointer advance is not visible in this listing (source line 286 is
 * absent). */
285 currentValues = vld1q_f32(inputPtr);
/* The running index is kept as float and converted to uint32 each pass.
 * NOTE(review): a float counter is exact only up to 2^24, so positions
 * beyond 16777216 may be inexact -- confirm the supported input length. */
287 currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
288 currentIndexes_u = vcvtq_u32_f32(currentIndexes);
/* Mask is all-ones in lanes where the stored maximum is kept
 * (current <= max). */
289 compareResults = vcleq_f32(currentValues, maxValues);
/* Bitwise select: old index where the mask is set, new index elsewhere
 * (vbicq_u32(a, b) computes a AND NOT b). */
290 maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
291 vbicq_u32(currentIndexes_u, compareResults));
292 maxValues = vmaxq_f32(currentValues, maxValues);
/* Spill per-lane maxima and indexes; the indexes are converted back to
 * float for the scalar reduction. */
296 vst1q_f32(maxValuesBuffer, maxValues);
297 vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
/* Reduce the 4 lanes: take the largest value; on equal values keep the
 * smaller index. */
298 for (number = 0; number < 4; number++) {
299 if (maxValuesBuffer[number] > max) {
300 index = maxIndexesBuffer[number];
301 max = maxValuesBuffer[number];
302 }
/* NOTE(review): this branch subscripts the vector maxValues directly, while
 * the branch above and the SSE/AVX reductions in this file read
 * maxValuesBuffer[number]. The store above copies maxValues into
 * maxValuesBuffer, so both hold the same value, but subscripting a NEON
 * vector depends on a compiler vector extension. Consider
 * maxValuesBuffer[number] here for consistency and portability. */
else if (maxValues[number] == max) {
303 if (index > maxIndexesBuffer[number])
304 index = maxIndexesBuffer[number];
/* Scalar tail over the elements that did not fill a group of 4
 * (the body of this loop is not visible in this listing). */
308 number = quarterPoints * 4;
309 for (; number < num_points; number++) {
310 if (src0[number] > max) {
315 target[0] = (uint32_t)index;
322 #ifdef LV_HAVE_GENERIC
327 if (num_points > 0) {
333 for (;
i < num_points; ++
i) {
349 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
350 #define INCLUDED_volk_32f_index_max_32u_u_H
352 #include <inttypes.h>
358 #include <immintrin.h>
/*
 * Unaligned AVX fragment: finds the index of the largest float in src0 and
 * writes it to target[0]. Eight lanes each track a running maximum and the
 * index at which it was seen; the lanes are then reduced in scalar code and
 * the leftover elements are scanned one at a time.
 * This listing omits several source lines (declarations of number, max,
 * index and the two spill buffers, plus closing braces), so comments only
 * describe the statements that are visible.
 */
363 if (num_points > 0) {
/* Number of full 8-float groups. NOTE(review): the name says "quarter" but
 * the divisor is 8 -- presumably carried over from the 4-lane SSE code. */
365 const uint32_t quarterPoints = num_points / 8;
/* The cast drops the const qualifier of src0. */
367 float* inputPtr = (
float*)src0;
/* Each iteration advances every lane's index by 8. */
369 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* _mm256_set_ps takes the highest lane first, so lane 0 holds -8 and lane 7
 * holds -1; after the first add of 8 the lanes read 0..7. */
370 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
/* Every lane starts from the scalar max (its initial value is not visible
 * in this listing) with index 0. */
374 __m256 maxValues = _mm256_set1_ps(max);
375 __m256 maxValuesIndex = _mm256_setzero_ps();
376 __m256 compareResults;
377 __m256 currentValues;
382 for (; number < quarterPoints; number++) {
/* Unaligned load: src0 needs no particular alignment in this variant. The
 * pointer advance is not visible in this listing (source line 384 is
 * absent). */
383 currentValues = _mm256_loadu_ps(inputPtr);
385 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than (ordered, so false when either operand is NaN): a lane
 * is only replaced by a strictly larger value, never by an equal one. */
386 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* NOTE(review): the assignment target of this blend is on a line missing
 * from the listing (source line 387); presumably maxValuesIndex -- confirm. */
388 _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
389 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* These are aligned stores even in the unaligned variant, so both local
 * buffers must be 32-byte aligned. Their declarations are not visible here;
 * presumably they use __VOLK_ATTR_ALIGNED(32) -- TODO confirm. */
393 _mm256_store_ps(maxValuesBuffer, maxValues);
394 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 8 lanes: take the largest value; on equal values keep the
 * smaller index. */
396 for (number = 0; number < 8; number++) {
397 if (maxValuesBuffer[number] > max) {
398 index = maxIndexesBuffer[number];
399 max = maxValuesBuffer[number];
400 }
else if (maxValuesBuffer[number] == max) {
401 if (index > maxIndexesBuffer[number])
402 index = maxIndexesBuffer[number];
/* Scalar tail over the elements that did not fill a group of 8
 * (the body of this loop is not visible in this listing). */
406 number = quarterPoints * 8;
407 for (; number < num_points; number++) {
408 if (src0[number] > max) {
/* NOTE(review): indexes travel through float lanes (set1_ps/add_ps/store_ps
 * above) before this cast. A float holds integers exactly only up to 2^24,
 * so positions beyond 16777216 may be reported inexactly -- confirm whether
 * callers can pass inputs that long. */
413 target[0] = (uint32_t)index;
420 #ifdef LV_HAVE_SSE4_1
421 #include <smmintrin.h>
424 volk_32f_index_max_32u_u_sse4_1(uint32_t* target,
const float* src0, uint32_t num_points)
426 if (num_points > 0) {
428 const uint32_t quarterPoints = num_points / 4;
430 float* inputPtr = (
float*)src0;
445 for (; number < quarterPoints; number++) {
448 currentIndexes =
_mm_add_ps(currentIndexes, indexIncrementValues);
449 compareResults =
_mm_cmpgt_ps(currentValues, maxValues);
451 _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
452 maxValues =
_mm_blendv_ps(maxValues, currentValues, compareResults);
459 for (number = 0; number < 4; number++) {
460 if (maxValuesBuffer[number] > max) {
461 index = maxIndexesBuffer[number];
462 max = maxValuesBuffer[number];
463 }
else if (maxValuesBuffer[number] == max) {
464 if (index > maxIndexesBuffer[number])
465 index = maxIndexesBuffer[number];
469 number = quarterPoints * 4;
470 for (; number < num_points; number++) {
471 if (src0[number] > max) {
476 target[0] = (uint32_t)index;
483 #include <xmmintrin.h>
488 if (num_points > 0) {
490 const uint32_t quarterPoints = num_points / 4;
492 float* inputPtr = (
float*)src0;
507 for (; number < quarterPoints; number++) {
510 currentIndexes =
_mm_add_ps(currentIndexes, indexIncrementValues);
511 compareResults =
_mm_cmpgt_ps(currentValues, maxValues);
522 for (number = 0; number < 4; number++) {
523 if (maxValuesBuffer[number] > max) {
524 index = maxIndexesBuffer[number];
525 max = maxValuesBuffer[number];
526 }
else if (maxValuesBuffer[number] == max) {
527 if (index > maxIndexesBuffer[number])
528 index = maxIndexesBuffer[number];
532 number = quarterPoints * 4;
533 for (; number < num_points; number++) {
534 if (src0[number] > max) {
539 target[0] = (uint32_t)index;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:261
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:130
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:486
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:325
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:198
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:361
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
for i
Definition: volk_config_fixed.tmpl.h:13