volk_32fc_index_max_16u.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

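/*!
 * \page volk_32fc_index_max_16u
 *
 * \b Overview
 *
 * Returns the index of the sample with the largest magnitude in a complex
 * vector, i.e. argmax_i |src0[i]|. Magnitudes are compared as squared
 * magnitudes (re^2 + im^2), which preserves the ordering while avoiding a
 * square root.
 *
 * Note that num_points is a uint32_t while the result is a uint16_t, so
 * every implementation below caps num_points at USHRT_MAX; samples beyond
 * that bound are ignored.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
 * \endcode
 *
 * \b Inputs
 * \li src0: The complex input vector.
 * \li num_points: The number of samples.
 *
 * \b Outputs
 * \li target: The index of the point with maximum magnitude.
 *
 * \b Example
 * A minimal sketch (the ramp data is illustrative):
 * \code
 *   uint32_t N = 16;
 *   uint32_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   uint16_t* max_index = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
 *
 *   for (uint32_t ii = 0; ii < N; ++ii) {
 *       // magnitude grows with ii, so the expected result is N - 1
 *       in[ii] = lv_cmake((float)ii, (float)-ii);
 *   }
 *
 *   volk_32fc_index_max_16u(max_index, in, N);
 *   printf("index of max value = %u\n", (unsigned)*max_index);
 *
 *   volk_free(in);
 *   volk_free(max_index);
 * \endcode
 */
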
#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
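    /*
     * (In short, assuming the lane-local horizontal adds used by that helper:
     * _mm256_hadd_ps combines values within each 128-bit lane, so the squared
     * magnitudes of samples 0, 1, 4, 5 land in the low lane and those of
     * samples 2, 3, 6, 7 in the high lane. current_indices is initialized in
     * that same permuted order so index lanes match magnitude lanes.)
     */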

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

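/*
 * variant_0 and variant_1 compute the same result; they differ only in the
 * intrinsic sequences used inside vector_32fc_index_max_variant0() and
 * vector_32fc_index_max_variant1() (see volk_avx2_intrinsics.h). Both are
 * kept so the dispatcher can pick whichever is faster on a given CPU.
 */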
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5;
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);
    xmm9 = _mm_setzero_si128();
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps();

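    /*
     * Index tracking in the loop below: xmm8 carries the indices of the four
     * squared magnitudes in xmm1, and xmm9 the indices of the running maxima
     * in xmm3. After xmm3 = max(xmm1, xmm3), cmpeq(xmm1, xmm3) flags lanes
     * where the new value won (take its index from xmm8), while
     * cmplt(xmm1, xmm3) flags lanes where the old maximum survived (keep the
     * index in xmm9). The masks are disjoint, so adding the two masked
     * values merges them like a bitwise OR.
     */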
    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint16_t index = 0;

    uint32_t i = 0;

    for (; i < num_bytes >> 3; ++i) { // num_bytes >> 3 == num_points
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}
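
/*
 * The generic version above is the reference implementation. A SIMD
 * protokernel can be spot-checked against it through the generated _manual
 * dispatcher (a sketch; ties between equal magnitudes may legitimately be
 * resolved differently by different implementations):
 *
 *   uint16_t ref, fast;
 *   volk_32fc_index_max_16u_manual(&ref, in, N, "generic");
 *   volk_32fc_index_max_16u(&fast, in, N);
 *   // expect ref == fast whenever the maximum is unique
 */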

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

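/*
 * The unaligned (_u) protokernels below mirror the aligned (_a) ones above;
 * the only difference is that they use _mm256_loadu_ps instead of
 * _mm256_load_ps, so src0 does not need to be 32-byte aligned.
 */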
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/