#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
#define INCLUDED_volk_32f_binary_slicer_8i_H

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        // slice: non-negative input maps to 1, negative input to 0
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                                                const float* aVector,
                                                                unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        // the comparison itself yields 0 or 1, so no branch is needed
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_GENERIC */
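
/* Usage sketch (illustrative): in a full VOLK build these kernels are
 * normally reached through the generated dispatcher
 * volk_32f_binary_slicer_8i(), which picks the best enabled variant at
 * runtime, but any implementation can be called directly:
 *
 *   float in[8] = { -1.0f, 0.5f, 0.0f, -0.25f, 2.0f, -3.0f, 1.0f, -0.5f };
 *   int8_t out[8];
 *   volk_32f_binary_slicer_8i_generic(out, in, 8);
 *   // out == { 0, 1, 1, 0, 1, 0, 1, 0 }
 */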

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    // per-lane byte pattern that undoes the interleaving left by the packs
    // below (the same 16-byte pattern is repeated in both 128-bit lanes)
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_load_ps(aPtr);
        a1_val = _mm256_load_ps(aPtr + 8);
        a2_val = _mm256_load_ps(aPtr + 16);
        a3_val = _mm256_load_ps(aPtr + 24);

        // compare >= 0; each element becomes all-ones or all-zeros
        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        // convert to int32 and shift right by 31, leaving 0 or 1
        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        // pack into 16-bit results (per 128-bit lane)
        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);

        // pack into 8-bit results (per 128-bit lane)
        res0_i = _mm256_packs_epi16(res0_i, res2_i);

        // swap the two middle 64-bit quarters: order 0, 2, 1, 3 -> 0xd8
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);

        // re-order bytes within each lane to recover memory order
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_store_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_AVX2 */
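
/* Worked trace of the lane re-ordering above. Label the four 8-float input
 * blocks a, b, c, d. The AVX2 packs operate per 128-bit lane, so after both
 * pack steps the register holds (| marks the lane boundary):
 *   a0-a3 b0-b3 c0-c3 d0-d3 | a4-a7 b4-b7 c4-c7 d4-d7
 * permute4x64(0xd8) gathers each block pair's halves into one lane:
 *   a0-a3 b0-b3 a4-a7 b4-b7 | c0-c3 d0-d3 c4-c7 d4-d7
 * and the per-lane byte shuffle restores linear memory order:
 *   a0-a7 b0-b7 | c0-c7 d0-d7
 * _CMP_GE_OS is an ordered compare, so NaN inputs slice to 0, matching the
 * scalar (*aPtr++ >= 0) tail loop.
 */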

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    // identical to the aligned version above, except for the
    // unaligned loads and store
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n32points = num_points / 32;

    const __m256 zero_val = _mm256_set1_ps(0.0f);
    __m256 a0_val, a1_val, a2_val, a3_val;
    __m256 res0_f, res1_f, res2_f, res3_f;
    __m256i res0_i, res1_i, res2_i, res3_i;
    __m256i byte_shuffle = _mm256_set_epi8(15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0,
                                           15, 14, 13, 12, 7, 6, 5, 4,
                                           11, 10, 9, 8, 3, 2, 1, 0);

    for (number = 0; number < n32points; number++) {
        a0_val = _mm256_loadu_ps(aPtr);
        a1_val = _mm256_loadu_ps(aPtr + 8);
        a2_val = _mm256_loadu_ps(aPtr + 16);
        a3_val = _mm256_loadu_ps(aPtr + 24);

        res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
        res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
        res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
        res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);

        res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
        res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
        res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
        res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);

        res0_i = _mm256_packs_epi32(res0_i, res1_i);
        res2_i = _mm256_packs_epi32(res2_i, res3_i);
        res0_i = _mm256_packs_epi16(res0_i, res2_i);
        res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
        res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);

        _mm256_storeu_si256((__m256i*)cPtr, res0_i);
        aPtr += 32;
        cPtr += 32;
    }

    for (number = n32points * 32; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    const __m128 zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_load_ps(aPtr);
        a1_val = _mm_load_ps(aPtr + 4);
        a2_val = _mm_load_ps(aPtr + 8);
        a3_val = _mm_load_ps(aPtr + 12);

        // compare >= 0; each element becomes all-ones or all-zeros
        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        // convert to int32 and shift right by 31, leaving 0 or 1
        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        // pack int32 -> int16 -> int8; the SSE2 packs are not lane-split,
        // so no extra shuffle is needed here
        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_store_si128((__m128i*)cPtr, res0_i);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_SSE2 */
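
/* Why saturating packs are safe here (SSE2 and AVX2 alike): every element is
 * already 0 or 1 when _mm_packs_epi32 / _mm_packs_epi16 run, so signed
 * saturation never triggers and the packs are exact width reductions.
 * One element's path through the pipeline, as a sketch:
 *   0.5f -> cmpge -> 0xFFFFFFFF (a NaN bit pattern) -> cvtps -> 0x80000000
 *        -> srli 31 -> 1 -> packs -> (int16)1 -> packs -> (int8)1
 *  -2.0f -> cmpge -> 0x00000000 -> cvtps -> 0 -> srli 31 -> 0 -> ... -> 0
 */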

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    // identical to the aligned SSE2 version above, except for the
    // unaligned loads and store
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    __m128 a0_val, a1_val, a2_val, a3_val;
    __m128 res0_f, res1_f, res2_f, res3_f;
    __m128i res0_i, res1_i, res2_i, res3_i;
    const __m128 zero_val = _mm_set1_ps(0.0f);

    for (number = 0; number < n16points; number++) {
        a0_val = _mm_loadu_ps(aPtr);
        a1_val = _mm_loadu_ps(aPtr + 4);
        a2_val = _mm_loadu_ps(aPtr + 8);
        a3_val = _mm_loadu_ps(aPtr + 12);

        res0_f = _mm_cmpge_ps(a0_val, zero_val);
        res1_f = _mm_cmpge_ps(a1_val, zero_val);
        res2_f = _mm_cmpge_ps(a2_val, zero_val);
        res3_f = _mm_cmpge_ps(a3_val, zero_val);

        res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
        res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
        res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
        res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

        res0_i = _mm_packs_epi32(res0_i, res1_i);
        res2_i = _mm_packs_epi32(res2_i, res3_i);
        res0_i = _mm_packs_epi16(res0_i, res2_i);

        _mm_storeu_si128((__m128i*)cPtr, res0_i);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    int8_t* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    float32x4x2_t input_val0, input_val1;
    float32x4_t zero_val;
    uint32x4x2_t res0_u32, res1_u32;
    uint16x4x2_t res0_u16x4, res1_u16x4;
    uint16x8x2_t res_u16x8;
    uint8x8x2_t res_u8;
    uint8x8_t one;

    zero_val = vdupq_n_f32(0.0);
    one = vdup_n_u8(0x01);

    for (number = 0; number < n16points; number++) {
        // vld2q de-interleaves even/odd elements into val[0]/val[1];
        // the matching vst2 below re-interleaves, so order is preserved
        input_val0 = vld2q_f32(aPtr);
        input_val1 = vld2q_f32(aPtr + 8);

        // compare against 0; each lane becomes all-ones or all-zeros
        res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
        res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
        res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
        res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

        // narrow uint32 -> uint16
        res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
        res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
        res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
        res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

        res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
        res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

        // narrow uint16 -> uint8
        res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
        res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);

        // the comparison left all-ones or all-zeros, so masking with
        // 0x01 yields exactly 1 or 0
        res_u8.val[0] = vand_u8(one, res_u8.val[0]);
        res_u8.val[1] = vand_u8(one, res_u8.val[1]);

        vst2_u8((unsigned char*)cPtr, res_u8);
        cPtr += 16;
        aPtr += 16;
    }

    for (number = n16points * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++ >= 0);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
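
/* Quick self-check sketch (hypothetical harness, not part of VOLK): run one
 * variant against another with a length that exercises the scalar tail loop.
 *
 *   enum { N = 100 };                  // deliberately not a multiple of 32
 *   float in[N];
 *   int8_t ref[N], out[N];
 *   for (int i = 0; i < N; i++)
 *       in[i] = (float)(i % 3) - 1.0f; // mix of negative, zero, positive
 *   volk_32f_binary_slicer_8i_generic(ref, in, N);
 *   volk_32f_binary_slicer_8i_generic_branchless(out, in, N);
 *   assert(memcmp(ref, out, N) == 0);  // <assert.h>, <string.h>
 */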