Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32fc_index_max_32u_a_H
58 #define INCLUDED_volk_32fc_index_max_32u_a_H
59 
60 #include <inttypes.h>
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 #include <volk/volk_complex.h>
64 
65 #ifdef LV_HAVE_AVX2
66 #include <immintrin.h>
68 
69 static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
70  lv_32fc_t* src0,
71  uint32_t num_points)
72 {
73  const __m256i indices_increment = _mm256_set1_epi32(8);
74  /*
75  * At the start of each loop iteration current_indices holds the indices of
76  * the complex numbers loaded from memory. Explanation for odd order is given
77  * in implementation of vector_32fc_index_max_variant0().
78  */
79  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
80 
81  __m256 max_values = _mm256_setzero_ps();
82  __m256i max_indices = _mm256_setzero_si256();
83 
84  for (unsigned i = 0; i < num_points / 8u; ++i) {
85  __m256 in0 = _mm256_load_ps((float*)src0);
86  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
88  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
89  src0 += 8;
90  }
91 
92  // determine maximum value and index in the result of the vectorized loop
93  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
94  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
95  _mm256_store_ps(max_values_buffer, max_values);
96  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
97 
98  float max = 0.f;
99  uint32_t index = 0;
100  for (unsigned i = 0; i < 8; i++) {
101  if (max_values_buffer[i] > max) {
102  max = max_values_buffer[i];
103  index = max_indices_buffer[i];
104  }
105  }
106 
107  // handle tail not processed by the vectorized loop
108  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
109  const float abs_squared =
110  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
111  if (abs_squared > max) {
112  max = abs_squared;
113  index = i;
114  }
115  ++src0;
116  }
117 
118  *target = index;
119 }
120 
121 #endif /*LV_HAVE_AVX2*/
122 
123 #ifdef LV_HAVE_AVX2
124 #include <immintrin.h>
126 
127 static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
128  lv_32fc_t* src0,
129  uint32_t num_points)
130 {
131  const __m256i indices_increment = _mm256_set1_epi32(8);
132  /*
133  * At the start of each loop iteration current_indices holds the indices of
134  * the complex numbers loaded from memory. Explanation for odd order is given
135  * in implementation of vector_32fc_index_max_variant0().
136  */
137  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
138 
139  __m256 max_values = _mm256_setzero_ps();
140  __m256i max_indices = _mm256_setzero_si256();
141 
142  for (unsigned i = 0; i < num_points / 8u; ++i) {
143  __m256 in0 = _mm256_load_ps((float*)src0);
144  __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
146  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
147  src0 += 8;
148  }
149 
150  // determine maximum value and index in the result of the vectorized loop
151  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
152  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
153  _mm256_store_ps(max_values_buffer, max_values);
154  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
155 
156  float max = 0.f;
157  uint32_t index = 0;
158  for (unsigned i = 0; i < 8; i++) {
159  if (max_values_buffer[i] > max) {
160  max = max_values_buffer[i];
161  index = max_indices_buffer[i];
162  }
163  }
164 
165  // handle tail not processed by the vectorized loop
166  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
167  const float abs_squared =
168  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
169  if (abs_squared > max) {
170  max = abs_squared;
171  index = i;
172  }
173  ++src0;
174  }
175 
176  *target = index;
177 }
178 
179 #endif /*LV_HAVE_AVX2*/
180 
181 #ifdef LV_HAVE_SSE3
182 #include <pmmintrin.h>
183 #include <xmmintrin.h>
184 
185 static inline void
186 volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
187 {
188  const uint32_t num_bytes = num_points * 8;
189 
190  union bit128 holderf;
191  union bit128 holderi;
192  float sq_dist = 0.0;
193 
194  union bit128 xmm5, xmm4;
195  __m128 xmm1, xmm2, xmm3;
196  __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
197 
198  xmm5.int_vec = _mm_setzero_si128();
199  xmm4.int_vec = _mm_setzero_si128();
200  holderf.int_vec = _mm_setzero_si128();
201  holderi.int_vec = _mm_setzero_si128();
202 
203  int bound = num_bytes >> 5;
204  int i = 0;
205 
206  xmm8 = _mm_setr_epi32(0, 1, 2, 3);
207  xmm9 = _mm_setzero_si128();
208  xmm10 = _mm_setr_epi32(4, 4, 4, 4);
209  xmm3 = _mm_setzero_ps();
210 
211  for (; i < bound; ++i) {
212  xmm1 = _mm_load_ps((float*)src0);
213  xmm2 = _mm_load_ps((float*)&src0[2]);
214 
215  src0 += 4;
216 
217  xmm1 = _mm_mul_ps(xmm1, xmm1);
218  xmm2 = _mm_mul_ps(xmm2, xmm2);
219 
220  xmm1 = _mm_hadd_ps(xmm1, xmm2);
221 
222  xmm3 = _mm_max_ps(xmm1, xmm3);
223 
224  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
225  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
226 
227  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
228  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
229 
230  xmm9 = _mm_add_epi32(xmm11, xmm12);
231 
232  xmm8 = _mm_add_epi32(xmm8, xmm10);
233  }
234 
235  if (num_bytes >> 4 & 1) {
236  xmm2 = _mm_load_ps((float*)src0);
237 
238  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
239  xmm8 = bit128_p(&xmm1)->int_vec;
240 
241  xmm2 = _mm_mul_ps(xmm2, xmm2);
242 
243  src0 += 2;
244 
245  xmm1 = _mm_hadd_ps(xmm2, xmm2);
246 
247  xmm3 = _mm_max_ps(xmm1, xmm3);
248 
249  xmm10 = _mm_setr_epi32(2, 2, 2, 2);
250 
251  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
252  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
253 
254  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
255  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
256 
257  xmm9 = _mm_add_epi32(xmm11, xmm12);
258 
259  xmm8 = _mm_add_epi32(xmm8, xmm10);
260  }
261 
262  if (num_bytes >> 3 & 1) {
263  sq_dist =
264  lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
265 
266  xmm2 = _mm_load1_ps(&sq_dist);
267 
268  xmm1 = xmm3;
269 
270  xmm3 = _mm_max_ss(xmm3, xmm2);
271 
272  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
273  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
274 
275  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
276 
277  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
278  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
279 
280  xmm9 = _mm_add_epi32(xmm11, xmm12);
281  }
282 
283  _mm_store_ps((float*)&(holderf.f), xmm3);
284  _mm_store_si128(&(holderi.int_vec), xmm9);
285 
286  target[0] = holderi.i[0];
287  sq_dist = holderf.f[0];
288  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
289  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
290  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
291  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
292  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
293  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
294 }
295 
296 #endif /*LV_HAVE_SSE3*/
297 
298 #ifdef LV_HAVE_GENERIC
299 static inline void
300 volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
301 {
302  const uint32_t num_bytes = num_points * 8;
303 
304  float sq_dist = 0.0;
305  float max = 0.0;
306  uint32_t index = 0;
307 
308  uint32_t i = 0;
309 
310  for (; i<num_bytes>> 3; ++i) {
311  sq_dist =
312  lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
313 
314  if (sq_dist > max) {
315  index = i;
316  max = sq_dist;
317  }
318  }
319  target[0] = index;
320 }
321 
322 #endif /*LV_HAVE_GENERIC*/
323 
324 #endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
325 
326 #ifndef INCLUDED_volk_32fc_index_max_32u_u_H
327 #define INCLUDED_volk_32fc_index_max_32u_u_H
328 
329 #include <inttypes.h>
330 #include <stdio.h>
331 #include <volk/volk_common.h>
332 #include <volk/volk_complex.h>
333 
334 #ifdef LV_HAVE_AVX2
335 #include <immintrin.h>
337 
338 static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
339  lv_32fc_t* src0,
340  uint32_t num_points)
341 {
342  const __m256i indices_increment = _mm256_set1_epi32(8);
343  /*
344  * At the start of each loop iteration current_indices holds the indices of
345  * the complex numbers loaded from memory. Explanation for odd order is given
346  * in implementation of vector_32fc_index_max_variant0().
347  */
348  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
349 
350  __m256 max_values = _mm256_setzero_ps();
351  __m256i max_indices = _mm256_setzero_si256();
352 
353  for (unsigned i = 0; i < num_points / 8u; ++i) {
354  __m256 in0 = _mm256_loadu_ps((float*)src0);
355  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
357  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
358  src0 += 8;
359  }
360 
361  // determine maximum value and index in the result of the vectorized loop
362  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
363  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
364  _mm256_store_ps(max_values_buffer, max_values);
365  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
366 
367  float max = 0.f;
368  uint32_t index = 0;
369  for (unsigned i = 0; i < 8; i++) {
370  if (max_values_buffer[i] > max) {
371  max = max_values_buffer[i];
372  index = max_indices_buffer[i];
373  }
374  }
375 
376  // handle tail not processed by the vectorized loop
377  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
378  const float abs_squared =
379  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
380  if (abs_squared > max) {
381  max = abs_squared;
382  index = i;
383  }
384  ++src0;
385  }
386 
387  *target = index;
388 }
389 
390 #endif /*LV_HAVE_AVX2*/
391 
392 #ifdef LV_HAVE_AVX2
393 #include <immintrin.h>
395 
396 static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
397  lv_32fc_t* src0,
398  uint32_t num_points)
399 {
400  const __m256i indices_increment = _mm256_set1_epi32(8);
401  /*
402  * At the start of each loop iteration current_indices holds the indices of
403  * the complex numbers loaded from memory. Explanation for odd order is given
404  * in implementation of vector_32fc_index_max_variant0().
405  */
406  __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
407 
408  __m256 max_values = _mm256_setzero_ps();
409  __m256i max_indices = _mm256_setzero_si256();
410 
411  for (unsigned i = 0; i < num_points / 8u; ++i) {
412  __m256 in0 = _mm256_loadu_ps((float*)src0);
413  __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
415  in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
416  src0 += 8;
417  }
418 
419  // determine maximum value and index in the result of the vectorized loop
420  __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
421  __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
422  _mm256_store_ps(max_values_buffer, max_values);
423  _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
424 
425  float max = 0.f;
426  uint32_t index = 0;
427  for (unsigned i = 0; i < 8; i++) {
428  if (max_values_buffer[i] > max) {
429  max = max_values_buffer[i];
430  index = max_indices_buffer[i];
431  }
432  }
433 
434  // handle tail not processed by the vectorized loop
435  for (unsigned i = num_points & (~7u); i < num_points; ++i) {
436  const float abs_squared =
437  lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
438  if (abs_squared > max) {
439  max = abs_squared;
440  index = i;
441  }
442  ++src0;
443  }
444 
445  *target = index;
446 }
447 
448 #endif /*LV_HAVE_AVX2*/
449 
450 #ifdef LV_HAVE_NEON
451 #include <arm_neon.h>
453 
454 static inline void
455 volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
456 {
457  unsigned int number = 0;
458  const uint32_t quarter_points = num_points / 4;
459  const lv_32fc_t* src0Ptr = src0;
460 
461  uint32_t indices[4] = { 0, 1, 2, 3 };
462  const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
463  uint32x4_t vec_indices = vld1q_u32(indices);
464  uint32x4_t vec_max_indices = vec_indices;
465 
466  if (num_points) {
467  float max = FLT_MIN;
468  uint32_t index = 0;
469 
470  float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
471 
472  for (; number < quarter_points; number++) {
473  // Load complex and compute magnitude squared
474  const float32x4_t vec_mag2 =
475  _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
476  __VOLK_PREFETCH(src0Ptr += 4);
477  // a > b?
478  const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
479  vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
480  vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
481  vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
482  }
483  uint32_t tmp_max_indices[4];
484  float tmp_max[4];
485  vst1q_u32(tmp_max_indices, vec_max_indices);
486  vst1q_f32(tmp_max, vec_max);
487 
488  for (int i = 0; i < 4; i++) {
489  if (tmp_max[i] > max) {
490  max = tmp_max[i];
491  index = tmp_max_indices[i];
492  }
493  }
494 
495  // Deal with the rest
496  for (number = quarter_points * 4; number < num_points; number++) {
497  const float re = lv_creal(*src0Ptr);
498  const float im = lv_cimag(*src0Ptr);
499  const float sq_dist = re * re + im * im;
500  if (sq_dist > max) {
501  max = sq_dist;
502  index = number;
503  }
504  src0Ptr++;
505  }
506  *target = index;
507  }
508 }
509 
510 #endif /*LV_HAVE_NEON*/
511 
512 #endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128i _mm_setzero_si128()
Definition: sse2neon.h:5339
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
Definition: sse2neon.h:5278
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
Definition: sse2neon.h:2055
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition: sse2neon.h:2145
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1118
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition: sse2neon.h:1885
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
#define _mm_shuffle_epi32(a, imm)
Definition: sse2neon.h:5358
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2025
Definition: volk_common.h:120
float f[4]
Definition: volk_common.h:124
__m128i int_vec
Definition: volk_common.h:132
uint32_t i[4]
Definition: volk_common.h:123
__m128 float_vec
Definition: volk_common.h:128
static void volk_32fc_index_max_32u_generic(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:300
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:186
static void volk_32fc_index_max_32u_neon(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:455
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:188
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:126
#define bit128_p(x)
Definition: volk_common.h:151
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73