Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_binary_slicer_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
57 #ifndef INCLUDED_volk_32f_binary_slicer_32i_H
58 #define INCLUDED_volk_32f_binary_slicer_32i_H
59 
60 
61 #ifdef LV_HAVE_GENERIC
62 
/*!
 * \brief Hard-decision slicer: maps each float to 1 or 0 (portable C version).
 *
 * For every index i below num_points, cVector[i] is set to 1 when
 * aVector[i] >= 0 and to 0 otherwise. A NaN input fails the comparison and
 * therefore produces 0; negative zero compares equal to zero and produces 1.
 *
 * \param cVector    destination; num_points ints are written
 * \param aVector    source; num_points floats are read
 * \param num_points number of elements to process (0 is a no-op)
 */
static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
                                                      const float* aVector,
                                                      unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        int decision = 0;

        if (aVector[i] >= 0) {
            decision = 1;
        }
        cVector[i] = decision;
    }
}
79 #endif /* LV_HAVE_GENERIC */
80 
81 
82 #ifdef LV_HAVE_GENERIC
83 
/*!
 * \brief Hard-decision slicer written without an explicit branch per sample.
 *
 * Stores the integer value of the comparison (aVector[i] >= 0) directly, so
 * each output is 1 for non-negative inputs (including -0.0f) and 0 for
 * negative inputs or NaN.
 *
 * \param cVector    destination; num_points ints are written
 * \param aVector    source; num_points floats are read
 * \param num_points number of elements to process (0 is a no-op)
 */
static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
                                                                 const float* aVector,
                                                                 unsigned int num_points)
{
    const float* src = aVector;
    int* dst = cVector;
    unsigned int remaining;

    for (remaining = num_points; remaining > 0; --remaining) {
        /* A relational expression in C evaluates to exactly 0 or 1. */
        *dst = (*src >= 0);
        ++dst;
        ++src;
    }
}
96 #endif /* LV_HAVE_GENERIC */
97 
98 
99 #ifdef LV_HAVE_SSE2
100 #include <emmintrin.h>
101 
/*!
 * \brief Hard-decision slicer, SSE2 version using aligned loads and stores.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise (NaN yields 0).
 * Four samples are handled per vector iteration; the remaining
 * num_points % 4 samples are handled by a scalar loop.
 *
 * \param cVector    destination; accessed with _mm_store_si128, so it must be
 *                   16-byte aligned
 * \param aVector    source; accessed with _mm_load_ps, so it must be
 *                   16-byte aligned
 * \param num_points number of elements to process
 */
static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    const unsigned int quarter_points = num_points / 4;
    const __m128 zero_val = _mm_set1_ps(0.0f);
    __m128 a_val, mask_f;
    __m128i mask_i, binary_i;

    for (number = 0; number < quarter_points; number++) {
        a_val = _mm_load_ps(aPtr);

        /* Each lane becomes all-ones when a >= 0, all-zeros otherwise. */
        mask_f = _mm_cmpge_ps(a_val, zero_val);

        /*
         * Reinterpret the mask bits as integers instead of converting them
         * numerically: an all-ones lane is a NaN bit pattern, and a
         * float-to-int conversion of NaN only gives a usable result through
         * the x86 "integer indefinite" value (and signals an invalid
         * operation). The logical shift then leaves exactly 0 or 1.
         */
        mask_i = _mm_castps_si128(mask_f);
        binary_i = _mm_srli_epi32(mask_i, 31);

        _mm_store_si128((__m128i*)cPtr, binary_i);

        cPtr += 4;
        aPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = quarter_points * 4; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
137 #endif /* LV_HAVE_SSE2 */
138 
139 
140 #ifdef LV_HAVE_AVX
141 #include <immintrin.h>
142 
/*!
 * \brief Hard-decision slicer, AVX version using aligned loads and stores.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise (NaN yields 0).
 * Eight samples are handled per vector iteration; the remaining
 * num_points % 8 samples are handled by a scalar loop.
 *
 * \param cVector    destination; accessed with _mm256_store_si256, so it must
 *                   be 32-byte aligned
 * \param aVector    source; accessed with _mm256_load_ps, so it must be
 *                   32-byte aligned
 * \param num_points number of elements to process
 */
static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const __m256 zeros = _mm256_set1_ps(0.0f);
    const __m256 ones = _mm256_set1_ps(1.0f);
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        const __m256 samples = _mm256_load_ps(aVector + 8 * i);

        /* All-ones lanes where sample >= 0, all-zeros elsewhere. */
        const __m256 mask = _mm256_cmp_ps(samples, zeros, _CMP_GE_OS);
        /* Masking the constant 1.0f leaves 1.0f or 0.0f in each lane. */
        const __m256 decision_f = _mm256_and_ps(mask, ones);
        const __m256i decision_i = _mm256_cvtps_epi32(decision_f);

        _mm256_store_si256((__m256i*)(cVector + 8 * i), decision_i);
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        cVector[i] = (aVector[i] >= 0) ? 1 : 0;
    }
}
179 #endif /* LV_HAVE_AVX */
180 
181 
182 #ifdef LV_HAVE_SSE2
183 #include <emmintrin.h>
184 
/*!
 * \brief Hard-decision slicer, SSE2 version using unaligned loads and stores.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise (NaN yields 0).
 * Four samples are handled per vector iteration; the remaining
 * num_points % 4 samples are handled by a scalar loop. The buffers are
 * accessed with _mm_loadu_ps / _mm_storeu_si128, so no particular alignment
 * is required.
 *
 * \param cVector    destination; num_points ints are written
 * \param aVector    source; num_points floats are read
 * \param num_points number of elements to process
 */
static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
                                                     const float* aVector,
                                                     unsigned int num_points)
{
    int* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    const unsigned int quarter_points = num_points / 4;
    const __m128 zero_val = _mm_set1_ps(0.0f);
    __m128 a_val, mask_f;
    __m128i mask_i, binary_i;

    for (number = 0; number < quarter_points; number++) {
        a_val = _mm_loadu_ps(aPtr);

        /* Each lane becomes all-ones when a >= 0, all-zeros otherwise. */
        mask_f = _mm_cmpge_ps(a_val, zero_val);

        /*
         * Reinterpret the mask bits as integers instead of converting them
         * numerically: an all-ones lane is a NaN bit pattern, and a
         * float-to-int conversion of NaN only gives a usable result through
         * the x86 "integer indefinite" value (and signals an invalid
         * operation). The logical shift then leaves exactly 0 or 1.
         */
        mask_i = _mm_castps_si128(mask_f);
        binary_i = _mm_srli_epi32(mask_i, 31);

        _mm_storeu_si128((__m128i*)cPtr, binary_i);

        cPtr += 4;
        aPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (number = quarter_points * 4; number < num_points; number++) {
        if (*aPtr++ >= 0) {
            *cPtr++ = 1;
        } else {
            *cPtr++ = 0;
        }
    }
}
220 #endif /* LV_HAVE_SSE2 */
221 
222 
223 #ifdef LV_HAVE_AVX
224 #include <immintrin.h>
225 
/*!
 * \brief Hard-decision slicer, AVX version using unaligned loads and stores.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise (NaN yields 0).
 * Eight samples are handled per vector iteration; the remaining
 * num_points % 8 samples are handled by a scalar loop. The buffers are
 * accessed with _mm256_loadu_ps / _mm256_storeu_si256, so no particular
 * alignment is required.
 *
 * \param cVector    destination; num_points ints are written
 * \param aVector    source; num_points floats are read
 * \param num_points number of elements to process
 */
static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
                                                    const float* aVector,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const __m256 zeros = _mm256_set1_ps(0.0f);
    const __m256 ones = _mm256_set1_ps(1.0f);
    unsigned int i;

    for (i = 0; i < eighth_points; ++i) {
        const __m256 samples = _mm256_loadu_ps(aVector + 8 * i);

        /* All-ones lanes where sample >= 0, all-zeros elsewhere. */
        const __m256 mask = _mm256_cmp_ps(samples, zeros, _CMP_GE_OS);
        /* Masking the constant 1.0f leaves 1.0f or 0.0f in each lane. */
        const __m256 decision_f = _mm256_and_ps(mask, ones);
        const __m256i decision_i = _mm256_cvtps_epi32(decision_f);

        _mm256_storeu_si256((__m256i*)(cVector + 8 * i), decision_i);
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        cVector[i] = (aVector[i] >= 0) ? 1 : 0;
    }
}
262 #endif /* LV_HAVE_AVX */
263 
264 
265 #endif /* INCLUDED_volk_32f_binary_slicer_32i_H */
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_srli_epi32(a, imm)
Definition: sse2neon.h:5838
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1133
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:6010
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_32f_binary_slicer_32i_generic(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:63
static void volk_32f_binary_slicer_32i_generic_branchless(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:84
static void volk_32f_binary_slicer_32i_u_sse2(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:185
static void volk_32f_binary_slicer_32i_a_avx(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:143
static void volk_32f_binary_slicer_32i_a_sse2(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:102
static void volk_32f_binary_slicer_32i_u_avx(int *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_binary_slicer_32i.h:226