#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <immintrin.h>
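/*
 * Note on the AVX2 kernels below: each loop iteration loads 8 complex samples
 * (two __m256 registers of interleaved re/im pairs), updates per-lane running
 * minima of |z|^2 together with the corresponding sample indices, and after the
 * loop reduces the 8 lanes to a single scalar minimum/index before handling the
 * remaining (num_points % 8) samples in plain C.
 */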
static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /* Indices of the eight samples currently held in the registers; the unusual
     * ordering matches the lane layout used by vector_32fc_index_min_variant0(). */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    /* Reduce the 8 per-lane minima and their indices to a single scalar pair. */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    /* Handle the tail that was not processed by the vectorized loop. */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
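/*
 * Variant 1 below has the same structure; it differs only in the helper it calls
 * (vector_32fc_index_min_variant1() instead of vector_32fc_index_min_variant0()),
 * i.e. in how the per-lane minimum/index updates are computed inside
 * volk_avx2_intrinsics.h.
 */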
#include <immintrin.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    /* Reduce the 8 per-lane minima and their indices to a single scalar pair. */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    /* Handle the tail that was not processed by the vectorized loop. */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    /* ... */
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
    /* ... */
    int bound = num_points >> 2;
    /* ... */

    /* Main loop: 4 complex samples per iteration. */
    for (int i = 0; i < bound; ++i) {
        /* ... */
    }

    /* Remainder handling: a leftover pair of samples, then a last single sample. */
    if (num_points >> 1 & 1) {
        /* ... */
    }

    if (num_points & 1) {
        /* ... */
    }

    /* Pick the smallest of the four per-lane minima and its index. */
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}
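/*
 * holderf and holderi above are instances of the 128-bit union from
 * volk_common.h: the same 16 bytes can be read back either as float f[4]
 * (the per-lane minima) or as uint32_t i[4] (the per-lane indices), which is
 * what makes the scalar compare-and-select epilogue possible without further
 * intrinsics.
 */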
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    for (uint32_t i = 0; i < num_points; ++i) {
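/*
 * Sketch of the (elided) body of the generic kernel, for reference. This is an
 * assumption based on the loop header above and on the scalar tail loops of the
 * SIMD kernels, not a verbatim copy of the original source:
 *
 *     float min = FLT_MAX;
 *     uint32_t index = 0;
 *     for (uint32_t i = 0; i < num_points; ++i) {
 *         const float re = lv_creal(*source);
 *         const float im = lv_cimag(*source);
 *         const float sq_dist = re * re + im * im;
 *         if (sq_dist < min) {
 *             min = sq_dist;
 *             index = i;
 *         }
 *         ++source;
 *     }
 *     *target = index;
 */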
#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/


#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <inttypes.h>

#include <immintrin.h>
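/*
 * The _u_ kernels below mirror the aligned _a_ kernels above; the only
 * difference is that the input is read with unaligned loads (_mm256_loadu_ps)
 * instead of aligned loads (_mm256_load_ps).
 */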
static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    /* Reduce the 8 per-lane minima and their indices to a single scalar pair. */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    /* Handle the tail that was not processed by the vectorized loop. */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
#include <immintrin.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    /* Reduce the 8 per-lane minima and their indices to a single scalar pair. */
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    /* Handle the tail that was not processed by the vectorized loop. */
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}
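/*
 * The NEON kernel below follows the same pattern as the AVX2 kernels, but with
 * 4 lanes: vld2q_f32 de-interleaves four complex samples,
 * _vmagnitudesquaredq_f32 computes |z|^2 per lane, and
 * vcltq_f32/vbslq_f32/vbslq_u32 keep the running per-lane minimum and its
 * index; a 4-element scalar reduction and a scalar tail loop finish the job.
 */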
#include <arm_neon.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    float min = FLT_MAX;
    uint32_t index = 0;

    float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

    for (uint32_t number = 0; number < quarter_points; number++) {
        /* |z|^2 of four de-interleaved complex samples */
        const float32x4_t vec_mag2 =
            _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
        __VOLK_PREFETCH(sourcePtr += 4);
        /* keep the smaller magnitude and its index per lane */
        const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
        vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
        vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
        vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
    }

    uint32_t tmp_min_indices[4];
    float tmp_min[4];
    vst1q_u32(tmp_min_indices, vec_min_indices);
    vst1q_f32(tmp_min, vec_min);

    for (int i = 0; i < 4; i++) {
        if (tmp_min[i] < min) {
            min = tmp_min[i];
            index = tmp_min_indices[i];
        }
    }

    /* Handle the tail that was not processed by the vectorized loop. */
    for (uint32_t number = quarter_points * 4; number < num_points; number++) {
        const float re = lv_creal(*sourcePtr);
        const float im = lv_cimag(*sourcePtr);
        const float sq_dist = re * re + im * im;
        if (sq_dist < min) {
            min = sq_dist;
            index = number;
        }
        sourcePtr++;
    }

    *target = index;
}
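/*
 * Usage sketch (hypothetical caller code, not part of this header): the public
 * dispatcher volk_32fc_index_min_32u() selects the best of the kernels above at
 * runtime; volk_malloc()/volk_get_alignment()/volk_free() are the usual VOLK
 * allocation helpers.
 *
 *     uint32_t index = 0;
 *     const uint32_t N = 1024;
 *     lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N,
 *                                             volk_get_alignment());
 *     // ... fill `in` with complex samples ...
 *     volk_32fc_index_min_32u(&index, in, N);  // index of the sample with the
 *                                              // smallest |z|^2
 *     volk_free(in);
 */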