68 #ifndef INCLUDED_volk_32i_x2_and_32i_a_H
69 #define INCLUDED_volk_32i_x2_and_32i_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (AVX-512F, aligned).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 *
 * The vector loop uses the aligned load/store intrinsics
 * (_mm512_load_si512 / _mm512_store_si512), so all three buffers must
 * satisfy the alignment those intrinsics require.
 */
static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    /* One 512-bit register holds 16 int32 lanes. */
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_load_si512(aPtr);
        const __m512i bVal = _mm512_load_si512(bPtr);

        _mm512_store_si512(cPtr, _mm512_and_si512(aVal, bVal));

        /* Step to the next group of 16 elements. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (AVX2, aligned).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 *
 * The vector loop uses the aligned load/store intrinsics
 * (_mm256_load_si256 / _mm256_store_si256), so all three buffers must
 * satisfy the alignment those intrinsics require.
 */
static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    /* One 256-bit register holds 8 int32 lanes. */
    const unsigned int oneEightPoints = num_points / 8;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    for (number = 0; number < oneEightPoints; number++) {
        const __m256i aVal = _mm256_load_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_load_si256((const __m256i*)bPtr);

        _mm256_store_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        /* Step to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (SSE, aligned).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 *
 * The buffers are viewed as float lanes only so that the SSE1 intrinsics
 * _mm_load_ps / _mm_and_ps / _mm_store_ps can be used; the AND acts on the
 * raw bits, so no numeric conversion takes place. The aligned load/store
 * intrinsics require suitably aligned buffers.
 *
 * NOTE(review): the vector-loop body was rebuilt from the intrinsics this
 * kernel is listed as using; confirm against the upstream kernel.
 */
static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* One 128-bit register holds 4 32-bit lanes. */
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    float* cPtr = (float*)cVector;
    const float* aPtr = (const float*)aVector;
    const float* bPtr = (const float*)bVector;

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);

        _mm_store_ps(cPtr, _mm_and_ps(aVal, bVal));

        /* Step to the next group of 4 elements. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (ARM NEON).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 */
static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    /* One 128-bit Q register holds 4 int32 lanes. */
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        const int32x4_t a_val = vld1q_s32(aPtr);
        const int32x4_t b_val = vld1q_s32(bPtr);

        vst1q_s32(cPtr, vandq_s32(a_val, b_val));

        /* Step to the next group of 4 elements; the tail loop below
         * continues from wherever these pointers end up. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) & (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
221 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Element-wise bitwise AND of two int32 vectors (portable C).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 * Nothing is read or written when num_points is 0.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 */
static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
                                               const int32_t* aVector,
                                               const int32_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#ifdef LV_HAVE_ORC
/* Implemented outside this header; only the prototype is visible here. */
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points);

/*!
 * \brief ORC entry point for the int32 bitwise-AND kernel.
 *
 * Forwards all arguments unchanged to volk_32i_x2_and_32i_a_orc_impl.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 */
static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
259 #ifndef INCLUDED_volk_32i_x2_and_32i_u_H
260 #define INCLUDED_volk_32i_x2_and_32i_u_H
262 #include <inttypes.h>
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (AVX-512F, unaligned).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 *
 * The vector loop uses the unaligned load/store intrinsics
 * (_mm512_loadu_si512 / _mm512_storeu_si512).
 */
static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    /* One 512-bit register holds 16 int32 lanes. */
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_loadu_si512(aPtr);
        const __m512i bVal = _mm512_loadu_si512(bPtr);

        _mm512_storeu_si512(cPtr, _mm512_and_si512(aVal, bVal));

        /* Step to the next group of 16 elements. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Element-wise bitwise AND of two int32 vectors (AVX2, unaligned).
 *
 * Writes cVector[i] = aVector[i] & bVector[i] for every i in [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32 elements to process.
 *
 * The vector loop uses the unaligned load/store intrinsics
 * (_mm256_loadu_si256 / _mm256_storeu_si256).
 */
static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    /* One 256-bit register holds 8 int32 lanes. */
    const unsigned int oneEightPoints = num_points / 8;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    for (number = 0; number < oneEightPoints; number++) {
        const __m256i aVal = _mm256_loadu_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_loadu_si256((const __m256i*)bPtr);

        _mm256_storeu_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        /* Step to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole register. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */
/*
 * Cross-reference index (documentation-generator residue, kept for reference):
 *
 *   float32x4_t __m128
 *       Definition: sse2neon.h:235
 *   FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
 *       Definition: sse2neon.h:1064
 *   FORCE_INLINE __m128 _mm_load_ps(const float *p)
 *       Definition: sse2neon.h:1858
 *   FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
 *       Definition: sse2neon.h:2704
 *   static void volk_32i_x2_and_32i_a_sse(int32_t *cVector, const int32_t *aVector,
 *                                         const int32_t *bVector, unsigned int num_points)
 *       Definition: volk_32i_x2_and_32i.h:153
 *   static void volk_32i_x2_and_32i_generic(int32_t *cVector, const int32_t *aVector,
 *                                           const int32_t *bVector, unsigned int num_points)
 *       Definition: volk_32i_x2_and_32i.h:223
 *   static void volk_32i_x2_and_32i_neon(int32_t *cVector, const int32_t *aVector,
 *                                        const int32_t *bVector, unsigned int num_points)
 *       Definition: volk_32i_x2_and_32i.h:191
 */