Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32i_x2_and_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
68 #ifndef INCLUDED_volk_32i_x2_and_32i_a_H
69 #define INCLUDED_volk_32i_x2_and_32i_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 
74 #ifdef LV_HAVE_AVX512F
75 #include <immintrin.h>
76 
/*!
 * \brief Bitwise AND of two int32_t vectors, aligned AVX-512F version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points).
 *
 * \param cVector    Output buffer, written with _mm512_store_si512.
 * \param aVector    First input buffer, read with _mm512_load_si512.
 * \param bVector    Second input buffer, read with _mm512_load_si512.
 * \param num_points Number of int32_t elements to process.
 *
 * \note The aligned load/store intrinsics used here expect 64-byte aligned
 *       addresses, so all three buffers must be 64-byte aligned.
 */
static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    unsigned int number;
    const unsigned int sixteenthPoints = num_points / 16;

    /* The parameters already have the right types; using them directly keeps
     * the inputs const-qualified all the way to the load intrinsics. */
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: 16 int32_t lanes (512 bits) per iteration. */
    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_load_si512(aPtr);
        const __m512i bVal = _mm512_load_si512(bPtr);

        _mm512_store_si512(cPtr, _mm512_and_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
109 #endif /* LV_HAVE_AVX512F */
110 
111 #ifdef LV_HAVE_AVX2
112 #include <immintrin.h>
113 
/*!
 * \brief Bitwise AND of two int32_t vectors, aligned AVX2 version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points).
 *
 * \param cVector    Output buffer, written with _mm256_store_si256.
 * \param aVector    First input buffer, read with _mm256_load_si256.
 * \param bVector    Second input buffer, read with _mm256_load_si256.
 * \param num_points Number of int32_t elements to process.
 *
 * \note The aligned load/store intrinsics used here expect 32-byte aligned
 *       addresses, so all three buffers must be 32-byte aligned.
 */
static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int number;
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: 8 int32_t lanes (256 bits) per iteration. */
    for (number = 0; number < oneEightPoints; number++) {
        /* Cast to a const vector pointer so the inputs keep their const
         * qualification. */
        const __m256i aVal = _mm256_load_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_load_si256((const __m256i*)bPtr);

        _mm256_store_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop did not cover. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
147 #endif /* LV_HAVE_AVX2 */
148 
149 
150 #ifdef LV_HAVE_SSE
151 #include <xmmintrin.h>
152 
/*!
 * \brief Bitwise AND of two int32_t vectors, aligned SSE version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points).
 *
 * The integer buffers are viewed as packed floats purely so that the
 * bitwise _mm_and_ps can be applied to them; no floating-point arithmetic
 * is performed on the values.
 *
 * \param cVector    Output buffer, written with _mm_store_ps.
 * \param aVector    First input buffer, read with _mm_load_ps.
 * \param bVector    Second input buffer, read with _mm_load_ps.
 * \param num_points Number of int32_t elements to process.
 *
 * \note The aligned load/store intrinsics used here expect 16-byte aligned
 *       addresses, so all three buffers must be 16-byte aligned.
 */
static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    /* Reinterpret as float lanes; the input casts keep const intact. */
    float* cPtr = (float*)cVector;
    const float* aPtr = (const float*)aVector;
    const float* bPtr = (const float*)bVector;

    /* Vector body: 4 x 32-bit lanes (128 bits) per iteration. */
    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);

        _mm_store_ps(cPtr, _mm_and_ps(aVal, bVal));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
185 #endif /* LV_HAVE_SSE */
186 
187 
188 #ifdef LV_HAVE_NEON
189 #include <arm_neon.h>
190 
/*!
 * \brief Bitwise AND of two int32_t vectors, NEON version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points).
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    unsigned int number;
    const unsigned int quarter_points = num_points / 4;

    /* Vector body: 4 int32_t lanes (128 bits) per iteration. */
    for (number = 0; number < quarter_points; number++) {
        const int32x4_t a_val = vld1q_s32(aPtr);
        const int32x4_t b_val = vld1q_s32(bPtr);

        vst1q_s32(cPtr, vandq_s32(a_val, b_val));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the pointers already sit just past the last full vector,
     * so continue from there for the remaining num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) & (*bPtr++);
    }
}
218 #endif /* LV_HAVE_NEON */
219 
220 
221 #ifdef LV_HAVE_GENERIC
222 
/*!
 * \brief Bitwise AND of two int32_t vectors, portable C version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points). Nothing is read or written when num_points is 0.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
                                               const int32_t* aVector,
                                               const int32_t* bVector,
                                               unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
237 #endif /* LV_HAVE_GENERIC */
238 
239 
240 #ifdef LV_HAVE_ORC
/* Defined outside this header; only its prototype is visible here.
 * Presumably the ORC-generated kernel -- confirm against the build. */
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points);

/*!
 * \brief Bitwise AND of two int32_t vectors, ORC version.
 *
 * Thin wrapper that forwards all arguments unchanged to
 * volk_32i_x2_and_32i_a_orc_impl().
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
253 #endif /* LV_HAVE_ORC */
254 
255 
256 #endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
257 
258 
259 #ifndef INCLUDED_volk_32i_x2_and_32i_u_H
260 #define INCLUDED_volk_32i_x2_and_32i_u_H
261 
262 #include <inttypes.h>
263 #include <stdio.h>
264 
265 #ifdef LV_HAVE_AVX512F
266 #include <immintrin.h>
267 
/*!
 * \brief Bitwise AND of two int32_t vectors, unaligned AVX-512F version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points). Uses the unaligned load/store intrinsics, so the buffers
 * need no particular alignment beyond that of int32_t.
 *
 * \param cVector    Output buffer, written with _mm512_storeu_si512.
 * \param aVector    First input buffer, read with _mm512_loadu_si512.
 * \param bVector    Second input buffer, read with _mm512_loadu_si512.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    unsigned int number;
    const unsigned int sixteenthPoints = num_points / 16;

    /* The parameters already have the right types; using them directly keeps
     * the inputs const-qualified all the way to the load intrinsics. */
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: 16 int32_t lanes (512 bits) per iteration. */
    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_loadu_si512(aPtr);
        const __m512i bVal = _mm512_loadu_si512(bPtr);

        _mm512_storeu_si512(cPtr, _mm512_and_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
300 #endif /* LV_HAVE_AVX512F */
301 
302 #ifdef LV_HAVE_AVX2
303 #include <immintrin.h>
304 
/*!
 * \brief Bitwise AND of two int32_t vectors, unaligned AVX2 version.
 *
 * Computes cVector[i] = aVector[i] & bVector[i] for every i in
 * [0, num_points). Uses the unaligned load/store intrinsics, so the buffers
 * need no particular alignment beyond that of int32_t.
 *
 * \param cVector    Output buffer, written with _mm256_storeu_si256.
 * \param aVector    First input buffer, read with _mm256_loadu_si256.
 * \param bVector    Second input buffer, read with _mm256_loadu_si256.
 * \param num_points Number of int32_t elements to process.
 */
static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int number;
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: 8 int32_t lanes (256 bits) per iteration. */
    for (number = 0; number < oneEightPoints; number++) {
        /* Cast to a const vector pointer so the inputs keep their const
         * qualification. */
        const __m256i aVal = _mm256_loadu_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_loadu_si256((const __m256i*)bPtr);

        _mm256_storeu_si256((__m256i*)cPtr, _mm256_and_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop did not cover. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] & bVector[number];
    }
}
338 #endif /* LV_HAVE_AVX2 */
339 
340 
341 #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32i_x2_and_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:153
static void volk_32i_x2_and_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:223
static void volk_32i_x2_and_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:191