Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels

volk_32fc_index_min_16u.h
/* -*- c++ -*- */
/*
 * Copyright 2021 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
#define INCLUDED_volk_32fc_index_min_16u_a_H

#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

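/*
 * vector_32fc_index_min_variant0() is defined in volk/volk_avx2_intrinsics.h
 * and is not shown in this file. The sketch below (illustrative only, with a
 * hypothetical name, not the exact upstream code) shows the logic and
 * explains the odd index order above: _mm256_hadd_ps() adds adjacent pairs
 * within each 128-bit lane, so the eight squared magnitudes emerge in the
 * order 0, 1, 4, 5, 2, 3, 6, 7, which is exactly the pattern preloaded into
 * current_indices by _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0).
 *
 *   static inline void vector_32fc_index_min_sketch(__m256 in0, __m256 in1,
 *       __m256* min_values, __m256i* min_indices,
 *       __m256i* current_indices, __m256i indices_increment)
 *   {
 *       // |a+bi|^2 = a*a + b*b; hadd interleaves the pair sums of in0 and
 *       // in1 per 128-bit lane, producing the permuted order noted above
 *       const __m256 abs_squared =
 *           _mm256_hadd_ps(_mm256_mul_ps(in0, in0), _mm256_mul_ps(in1, in1));
 *       // lanes where the new magnitude undercuts the running minimum
 *       const __m256 mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
 *       *min_values = _mm256_min_ps(abs_squared, *min_values);
 *       *min_indices = _mm256_blendv_epi8(
 *           *min_indices, *current_indices, _mm256_castps_si256(mask));
 *       *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
 *   }
 */
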
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

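/*
 * Note: variant_0 and variant_1 perform the same computation; they differ
 * only in which helper from volk/volk_avx2_intrinsics.h merges new minima
 * into min_indices (two instruction mixes for the same result). Keeping both
 * lets VOLK's volk_profile pick whichever runs faster on a given CPU.
 */
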
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // indices of the four current points
    xmm9 = _mm_setzero_si128();         // indices of the running minima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // per-iteration index increment
    xmm3 = _mm_set_ps1(FLT_MAX);        // running minimum magnitudes

    int bound = num_points >> 2;

    // process four complex points per iteration
    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add yields the four squared magnitudes
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); // lanes keeping the old minimum
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); // lanes with a new minimum

        // branch-free select: new index where a minimum was found, old otherwise
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a remaining pair of complex points
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // handle a single remaining complex point
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // reduce the four lane minima to the overall minimum and its index
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}
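
/*
 * Scalar equivalent of the mask arithmetic above (illustrative): for each
 * SIMD lane, with ordinary (non-NaN) values,
 *
 *   new_min = (cur < min) ? cur : min;               // _mm_min_ps
 *   index   = (cur == new_min) ? cur_index : index;  // cmpeq/cmpgt + and + add
 *
 * After the minimum update, exactly one of the cmpeq/cmpgt masks is all-ones
 * in each lane, so and-ing each candidate index with its mask and adding the
 * two results selects the surviving index without a branch.
 */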

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float sq_dist = 0.0;
    float min = FLT_MAX;
    uint16_t index = 0;

    for (uint32_t i = 0; i < num_points; ++i) {
        sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
                  lv_cimag(source[i]) * lv_cimag(source[i]);

        if (sq_dist < min) {
            index = i;
            min = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

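/*
 * Illustrative usage (not part of the upstream header): find the sample with
 * the smallest magnitude in a block of complex floats. This assumes the
 * public VOLK API from <volk/volk.h>, whose volk_32fc_index_min_16u()
 * dispatcher selects the fastest implementation for the running machine:
 *
 *   #include <volk/volk.h>
 *
 *   uint32_t num_points = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
 *                                           volk_get_alignment());
 *   uint16_t index = 0;
 *   // ... fill in[0 .. num_points - 1] with samples ...
 *   volk_32fc_index_min_16u(&index, in, num_points);
 *   // index now holds the position of the minimum-magnitude sample
 *   volk_free(in);
 */
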
#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
#define INCLUDED_volk_32fc_index_min_16u_u_H

#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
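
/*
 * Note: the _a_ kernels above require buffers aligned to volk_get_alignment()
 * (they use aligned loads such as _mm256_load_ps), while the _u_ kernels
 * accept unaligned buffers via _mm256_loadu_ps. The generated
 * volk_32fc_index_min_16u() dispatcher checks the pointers it is handed and
 * calls the appropriate set.
 */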