57 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
58 #define INCLUDED_volk_32fc_index_max_32u_a_H
66 #include <immintrin.h>
69 static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
73 const __m256i indices_increment = _mm256_set1_epi32(8);
79 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
81 __m256 max_values = _mm256_setzero_ps();
82 __m256i max_indices = _mm256_setzero_si256();
84 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
85 __m256 in0 = _mm256_load_ps((
float*)src0);
86 __m256 in1 = _mm256_load_ps((
float*)(src0 + 4));
88 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
95 _mm256_store_ps(max_values_buffer, max_values);
96 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
100 for (
unsigned i = 0;
i < 8;
i++) {
101 if (max_values_buffer[
i] > max) {
102 max = max_values_buffer[
i];
103 index = max_indices_buffer[
i];
108 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
109 const float abs_squared =
111 if (abs_squared > max) {
124 #include <immintrin.h>
127 static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
131 const __m256i indices_increment = _mm256_set1_epi32(8);
137 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
139 __m256 max_values = _mm256_setzero_ps();
140 __m256i max_indices = _mm256_setzero_si256();
142 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
143 __m256 in0 = _mm256_load_ps((
float*)src0);
144 __m256 in1 = _mm256_load_ps((
float*)(src0 + 4));
146 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
153 _mm256_store_ps(max_values_buffer, max_values);
154 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
158 for (
unsigned i = 0;
i < 8;
i++) {
159 if (max_values_buffer[
i] > max) {
160 max = max_values_buffer[
i];
161 index = max_indices_buffer[
i];
166 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
167 const float abs_squared =
169 if (abs_squared > max) {
182 #include <pmmintrin.h>
183 #include <xmmintrin.h>
188 const uint32_t num_bytes = num_points * 8;
196 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
203 int bound = num_bytes >> 5;
211 for (;
i < bound; ++
i) {
235 if (num_bytes >> 4 & 1) {
262 if (num_bytes >> 3 & 1) {
286 target[0] = holderi.
i[0];
287 sq_dist = holderf.
f[0];
288 target[0] = (holderf.
f[1] > sq_dist) ? holderi.
i[1] : target[0];
289 sq_dist = (holderf.
f[1] > sq_dist) ? holderf.
f[1] : sq_dist;
290 target[0] = (holderf.
f[2] > sq_dist) ? holderi.
i[2] : target[0];
291 sq_dist = (holderf.
f[2] > sq_dist) ? holderf.
f[2] : sq_dist;
292 target[0] = (holderf.
f[3] > sq_dist) ? holderi.
i[3] : target[0];
293 sq_dist = (holderf.
f[3] > sq_dist) ? holderf.
f[3] : sq_dist;
298 #ifdef LV_HAVE_GENERIC
302 const uint32_t num_bytes = num_points * 8;
310 for (; i < num_bytes >> 3; ++
i) {
326 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
327 #define INCLUDED_volk_32fc_index_max_32u_u_H
329 #include <inttypes.h>
335 #include <immintrin.h>
338 static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
342 const __m256i indices_increment = _mm256_set1_epi32(8);
348 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
350 __m256 max_values = _mm256_setzero_ps();
351 __m256i max_indices = _mm256_setzero_si256();
353 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
354 __m256 in0 = _mm256_loadu_ps((
float*)src0);
355 __m256 in1 = _mm256_loadu_ps((
float*)(src0 + 4));
357 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
364 _mm256_store_ps(max_values_buffer, max_values);
365 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
369 for (
unsigned i = 0;
i < 8;
i++) {
370 if (max_values_buffer[
i] > max) {
371 max = max_values_buffer[
i];
372 index = max_indices_buffer[
i];
377 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
378 const float abs_squared =
380 if (abs_squared > max) {
393 #include <immintrin.h>
396 static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
400 const __m256i indices_increment = _mm256_set1_epi32(8);
406 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
408 __m256 max_values = _mm256_setzero_ps();
409 __m256i max_indices = _mm256_setzero_si256();
411 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
412 __m256 in0 = _mm256_loadu_ps((
float*)src0);
413 __m256 in1 = _mm256_loadu_ps((
float*)(src0 + 4));
415 in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
422 _mm256_store_ps(max_values_buffer, max_values);
423 _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
427 for (
unsigned i = 0;
i < 8;
i++) {
428 if (max_values_buffer[
i] > max) {
429 max = max_values_buffer[
i];
430 index = max_indices_buffer[
i];
435 for (
unsigned i = num_points & (~7u);
i < num_points; ++
i) {
436 const float abs_squared =
438 if (abs_squared > max) {
451 #include <arm_neon.h>
457 unsigned int number = 0;
458 const uint32_t quarter_points = num_points / 4;
461 uint32_t indices[4] = { 0, 1, 2, 3 };
462 const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
463 uint32x4_t vec_indices = vld1q_u32(indices);
464 uint32x4_t vec_max_indices = vec_indices;
470 float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
472 for (; number < quarter_points; number++) {
474 const float32x4_t vec_mag2 =
478 const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
479 vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
480 vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
481 vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
483 uint32_t tmp_max_indices[4];
485 vst1q_u32(tmp_max_indices, vec_max_indices);
486 vst1q_f32(tmp_max, vec_max);
488 for (
int i = 0;
i < 4;
i++) {
489 if (tmp_max[
i] > max) {
491 index = tmp_max_indices[
i];
496 for (number = quarter_points * 4; number < num_points; number++) {
497 const float re =
lv_creal(*src0Ptr);
498 const float im =
lv_cimag(*src0Ptr);
499 const float sq_dist = re * re + im * im;
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128i _mm_setzero_si128()
Definition: sse2neon.h:5339
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
Definition: sse2neon.h:5278
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2055
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition: sse2neon.h:2145
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1118
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition: sse2neon.h:1885
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
Definition: volk_common.h:120
float f[4]
Definition: volk_common.h:124
__m128i int_vec
Definition: volk_common.h:132
uint32_t i[4]
Definition: volk_common.h:123
__m128 float_vec
Definition: volk_common.h:128
static void volk_32fc_index_max_32u_generic(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:300
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:186
static void volk_32fc_index_max_32u_neon(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:455
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:188
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:126
#define bit128_p(x)
Definition: volk_common.h:151
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73