Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_index_min_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
#define INCLUDED_volk_32fc_index_min_32u_a_H

#include <float.h> /* FLT_MAX */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
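
/*
 * Note on the index order (7, 6, 3, 2, 5, 4, 1, 0) above, assuming the helper
 * computes the squared magnitudes with _mm256_hadd_ps as in
 * volk_avx2_intrinsics.h: hadd operates on the two 128-bit lanes
 * independently, so summing the squared components of eight complex numbers
 * c0..c7 (in0 = c0..c3, in1 = c4..c7) yields the magnitudes in element order
 * 0, 1, 4, 5, 2, 3, 6, 7. The index vector mirrors that permutation;
 * _mm256_set_epi32 lists elements from highest to lowest.
 */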

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // indices of the four points in flight
    xmm9 = _mm_setzero_si128();         // indices of the running minima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_set_ps1(FLT_MAX);        // running minimum squared magnitudes

    int bound = num_points >> 2;

    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add yields the four squared magnitudes
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        // branchless index update: take the new index where the new value is
        // the minimum, keep the old index elsewhere
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a remaining pair of complex points
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a final single complex point
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // reduce the four per-lane minima to the final scalar result
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}
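
/*
 * Scalar equivalent of the branchless update in the main loop above, per lane:
 *
 *   new_min = MIN(value, old_min);                   // _mm_min_ps
 *   eq_mask = (value == new_min) ? 0xFFFFFFFF : 0;   // _mm_cmpeq_ps
 *   gt_mask = (value >  new_min) ? 0xFFFFFFFF : 0;   // _mm_cmpgt_ps
 *   index   = (cand_index & eq_mask) + (old_index & gt_mask);
 *
 * Exactly one of the two masks is set per lane, so AND plus ADD acts as a
 * select. On an exact tie the newer candidate index wins here, whereas the
 * generic implementation keeps the first occurrence.
 */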

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint32_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
#define INCLUDED_volk_32fc_index_min_32u_u_H

#include <float.h> /* FLT_MAX */
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. An explanation of the odd order
     * is given in the implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
                                                const lv_32fc_t* source,
                                                uint32_t num_points)
{
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* sourcePtr = source;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_min_indices = vec_indices;

    if (num_points) {
        float min = FLT_MAX;
        uint32_t index = 0;

        float32x4_t vec_min = vdupq_n_f32(FLT_MAX);

        for (uint32_t number = 0; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
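            // vld2q_f32 de-interleaves the four complex points into a real
            // plane val[0] and an imaginary plane val[1]; the helper then
            // forms val[0] * val[0] + val[1] * val[1] per lane.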
            __VOLK_PREFETCH(sourcePtr += 4);
            // a < b?
            const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
            vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
            vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_min_indices[4];
        float tmp_min[4];
        vst1q_u32(tmp_min_indices, vec_min_indices);
        vst1q_f32(tmp_min, vec_min);

        for (int i = 0; i < 4; i++) {
            if (tmp_min[i] < min) {
                min = tmp_min[i];
                index = tmp_min_indices[i];
            }
        }

        // Deal with the rest
        for (uint32_t number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*sourcePtr);
            const float im = lv_cimag(*sourcePtr);
            const float sq_dist = re * re + im * im;
            if (sq_dist < min) {
                min = sq_dist;
                index = number;
            }
            sourcePtr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
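
A minimal usage sketch (this assumes the usual VOLK dispatcher generated for
this kernel, volk_32fc_index_min_32u(), which selects the fastest available
implementation at runtime):

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const uint32_t num_points = 1000;
    lv_32fc_t* in =
        (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, volk_get_alignment());
    uint32_t index = 0;

    // magnitudes grow with i, so the minimum lands at index 0
    for (uint32_t i = 0; i < num_points; i++) {
        in[i] = lv_cmake((float)(i + 1), (float)(i + 1));
    }

    volk_32fc_index_min_32u(&index, in, num_points);
    printf("minimum magnitude at index %u\n", index);

    volk_free(in);
    return 0;
}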