Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels

volk_32f_binary_slicer_8i.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                const float* aVector,
                                                                unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_GENERIC */
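
For orientation, a minimal usage sketch of the generic kernels above (illustrative only; it assumes this header is on the include path and that LV_HAVE_GENERIC is defined, as it is in a normal VOLK build):

#include <stdint.h>
#include <stdio.h>
#include "volk_32f_binary_slicer_8i.h"

int main(void)
{
    const float in[4] = { -1.5f, -0.0f, 0.0f, 0.25f };
    int8_t out[4];

    volk_32f_binary_slicer_8i_generic_branchless(out, in, 4);

    /* prints "0 1 1 1": -0.0f and 0.0f both compare >= 0 */
    for (int i = 0; i < 4; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}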

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0 (after packs_epi32):
        //   a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        //   c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0 (after packs_epi16):
        //   a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3,
        //   a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0 (after permute4x64_epi64):
        //   a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7,
        //   c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */
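
The pack/permute/shuffle sequence above exists only to undo the per-128-bit-lane interleaving of the AVX2 pack instructions. A scalar model of that fix-up (an illustrative helper, not part of VOLK) makes the byte movement easier to verify:

static void avx2_lane_fixup_model(const unsigned char in[32], unsigned char out[32])
{
    /* _mm256_permute4x64_epi64(x, 0xd8): reorder 8-byte quads 0,1,2,3 -> 0,2,1,3 */
    unsigned char tmp[32];
    static const int quad[4] = { 0, 2, 1, 3 };
    for (int q = 0; q < 4; q++)
        for (int i = 0; i < 8; i++)
            tmp[8 * q + i] = in[8 * quad[q] + i];

    /* _mm256_shuffle_epi8 with byte_shuffle: within each 16-byte lane, swap
     * the two middle 4-byte groups (0..3, 8..11, 4..7, 12..15) */
    static const int idx[16] = { 0, 1, 2, 3, 8,  9,  10, 11,
                                 4, 5, 6, 7, 12, 13, 14, 15 };
    for (int lane = 0; lane < 2; lane++)
        for (int i = 0; i < 16; i++)
            out[16 * lane + i] = tmp[16 * lane + idx[i]];
}

Feeding in the "after packs_epi16" layout from the comments (a0..a3, b0..b3, c0..c3, d0..d3, a4..a7, b4..b7, c4..c7, d4..d7) yields the in-order bytes a0..a7, b0..b7, c0..c7, d0..d7.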

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        // compare >= 0; return float
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to 32i and >> 31
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        // pack into 8-bit results
        // res0 (after packs_epi32):
        //   a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
        // res2:
        //   c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        // shuffle the lanes
        // res0 (after packs_epi16):
        //   a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3,
        //   a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
        // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // shuffle bytes within lanes
        // res0 (after permute4x64_epi64):
        //   a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7,
        //   c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_AVX2 */
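
Choosing between the _a_ (aligned) and _u_ (unaligned) variants is normally handled by the VOLK dispatcher; the _a_ kernels require buffers aligned to the SIMD width (32 bytes for AVX2). A sketch of driving this through VOLK's public API (assuming the usual volk_malloc/volk_get_alignment/volk_free interface from <volk/volk.h>):

#include <stdint.h>
#include <volk/volk.h>

void slice(unsigned int n)
{
    /* volk_malloc returns buffers aligned for the widest kernel,
     * so the dispatcher may select the aligned variants */
    float* in = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
    int8_t* out = (int8_t*)volk_malloc(n * sizeof(int8_t), volk_get_alignment());

    /* ... fill in[] ... */

    volk_32f_binary_slicer_8i(out, in, n); /* dispatches to the best kernel */

    volk_free(in);
    volk_free(out);
}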

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_load_ps(aPtr);
        a1_val = _mm_load_ps(aPtr + 4);
        a2_val = _mm_load_ps(aPtr + 8);
        a3_val = _mm_load_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results; the 128-bit packs concatenate their two
        // sources in order, so no lane fix-up is needed (unlike AVX2 above)
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_store_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    unsigned int n16points = num_points / 16;
    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    __m128 zero_val;
    zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_loadu_ps(aPtr);
        a1_val = _mm_loadu_ps(aPtr + 4);
        a2_val = _mm_loadu_ps(aPtr + 8);
        a3_val = _mm_loadu_ps(aPtr + 12);

        // compare >= 0; return float
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to 32i and >> 31
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_storeu_si128((__m128i*)cPtr, res0_i);

        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_SSE2 */
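
All variants must produce identical output, so a quick consistency check against the generic kernel makes a useful test harness (a sketch; assumes this header is included with LV_HAVE_GENERIC and LV_HAVE_SSE2 both defined):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static void check_sse2_matches_generic(void)
{
    enum { N = 1000 }; /* not a multiple of 16, so the tail loop runs too */
    float in[N];
    int8_t ref[N], out[N];

    for (int i = 0; i < N; i++)
        in[i] = (float)rand() / (float)RAND_MAX - 0.5f;

    volk_32f_binary_slicer_8i_generic(ref, in, N);
    volk_32f_binary_slicer_8i_u_sse2(out, in, N); /* safe for any alignment */

    for (int i = 0; i < N; i++)
        assert(ref[i] == out[i]);
}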

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0f);
    one = vdup_n_u8(0x01);

    // TODO: this is a good candidate for asm because the vcombines
    // can be eliminated simply by picking dst registers that are adjacent.
    for (number = 0; number < n16points; number++) {
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);

        // test against 0; return uint32
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16, then combine to 8-element vectors
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16x8 -> uint8x8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
        // We *could* load twice as much data and do another vcombine here
        // to get a uint8x16x2 vector and still do only 2 vandqs and a single
        // store, but that turns out to be ~16% slower than this version on
        // the zc702. Possibly register contention in the GCC scheduler slows
        // it down; a hand-written asm version with quad-word u8 registers is
        // much faster.

        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_NEON */
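
Note that vld2q_f32 de-interleaves the input into even- and odd-indexed streams and vst2_u8 re-interleaves them on the store, so the output bytes land in input order. A scalar model of one 16-sample block (illustrative only, not part of VOLK):

static void neon_stream_model(const float* in, int8_t* out)
{
    int8_t even[8], odd[8];
    for (int i = 0; i < 8; i++) {
        even[i] = (in[2 * i] >= 0);     /* vld2q_f32 .val[0] stream */
        odd[i] = (in[2 * i + 1] >= 0);  /* vld2q_f32 .val[1] stream */
    }
    for (int i = 0; i < 8; i++) {       /* vst2_u8 re-interleaves */
        out[2 * i] = even[i];
        out[2 * i + 1] = odd[i];
    }
}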


#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */