Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
68 #ifndef INCLUDED_volk_32i_x2_or_32i_a_H
69 #define INCLUDED_volk_32i_x2_or_32i_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 
74 #ifdef LV_HAVE_AVX512F
75 #include <immintrin.h>
76 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Aligned AVX-512F kernel. Full groups of 16 elements go through the aligned
 * 512-bit load/store intrinsics, so all three buffers are expected to satisfy
 * the alignment those intrinsics require. Leftover elements are handled by a
 * scalar loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    /* Number of elements covered by whole 16-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 16) {
        const __m512i lhs = _mm512_load_si512(aVector + idx);
        const __m512i rhs = _mm512_load_si512(bVector + idx);

        _mm512_store_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
109 #endif /* LV_HAVE_AVX512F */
110 
111 #ifdef LV_HAVE_AVX2
112 #include <immintrin.h>
113 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Aligned AVX2 kernel. Full groups of 8 elements go through the aligned
 * 256-bit load/store intrinsics, so all three buffers are expected to satisfy
 * the alignment those intrinsics require. Leftover elements are handled by a
 * scalar loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 8) {
        const __m256i lhs = _mm256_load_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_load_si256((const __m256i*)(bVector + idx));

        _mm256_store_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
147 #endif /* LV_HAVE_AVX2 */
148 
149 
150 #ifdef LV_HAVE_SSE
151 #include <xmmintrin.h>
152 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Aligned SSE kernel. Only the single-precision SSE intrinsics are used, so
 * the integer data is moved through __m128 registers; the OR is purely
 * bitwise, which makes the element type irrelevant for the result. Full groups
 * of 4 elements use the aligned load/store intrinsics, so all three buffers
 * are expected to satisfy the alignment those intrinsics require. Leftover
 * elements are handled by a scalar loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* Number of elements covered by whole 4-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 4) {
        const __m128 lhs = _mm_load_ps((const float*)(aVector + idx));
        const __m128 rhs = _mm_load_ps((const float*)(bVector + idx));

        _mm_store_ps((float*)(cVector + idx), _mm_or_ps(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
184 #endif /* LV_HAVE_SSE */
185 
186 
187 #ifdef LV_HAVE_NEON
188 #include <arm_neon.h>
189 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * NEON kernel. Full groups of 4 elements are loaded, ORed and stored with
 * 128-bit NEON intrinsics; leftover elements are handled by a scalar loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    /* Number of elements covered by whole 4-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 4) {
        const int32x4_t lhs = vld1q_s32(aVector + idx);
        const int32x4_t rhs = vld1q_s32(bVector + idx);

        vst1q_s32(cVector + idx, vorrq_s32(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
217 #endif /* LV_HAVE_NEON */
218 
219 
220 #ifdef LV_HAVE_GENERIC
221 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Portable scalar kernel with no alignment or instruction-set requirements.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
236 #endif /* LV_HAVE_GENERIC */
237 
238 
239 #ifdef LV_HAVE_ORC
/*
 * Externally defined OR kernel; its definition is not part of this header.
 * NOTE(review): presumably generated from an ORC program -- confirm against
 * the build.
 */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points);

/*
 * ORC entry point: forwards all arguments unchanged to
 * volk_32i_x2_or_32i_a_orc_impl().
 *
 * cVector:    output buffer
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
252 #endif /* LV_HAVE_ORC */
253 
254 
255 #endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
256 
257 
258 #ifndef INCLUDED_volk_32i_x2_or_32i_u_H
259 #define INCLUDED_volk_32i_x2_or_32i_u_H
260 
261 #include <inttypes.h>
262 #include <stdio.h>
263 
264 #ifdef LV_HAVE_AVX512F
265 #include <immintrin.h>
266 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Unaligned AVX-512F kernel. Full groups of 16 elements go through the
 * unaligned 512-bit load/store intrinsics; leftover elements are handled by a
 * scalar loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    /* Number of elements covered by whole 16-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 16) * 16;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 16) {
        const __m512i lhs = _mm512_loadu_si512(aVector + idx);
        const __m512i rhs = _mm512_loadu_si512(bVector + idx);

        _mm512_storeu_si512(cVector + idx, _mm512_or_si512(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
299 #endif /* LV_HAVE_AVX512F */
300 
301 #ifdef LV_HAVE_AVX2
302 #include <immintrin.h>
303 
/*
 * Element-wise bitwise OR of two int32 vectors: cVector[i] = aVector[i] | bVector[i].
 *
 * Unaligned AVX2 kernel. Full groups of 8 elements go through the unaligned
 * 256-bit load/store intrinsics; leftover elements are handled by a scalar
 * loop.
 *
 * cVector:    output buffer, receives num_points results
 * aVector:    first input buffer
 * bVector:    second input buffer
 * num_points: number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane vectors. */
    const unsigned int vectorizedCount = (num_points / 8) * 8;
    unsigned int idx;

    for (idx = 0; idx < vectorizedCount; idx += 8) {
        const __m256i lhs = _mm256_loadu_si256((const __m256i*)(aVector + idx));
        const __m256i rhs = _mm256_loadu_si256((const __m256i*)(bVector + idx));

        _mm256_storeu_si256((__m256i*)(cVector + idx), _mm256_or_si256(lhs, rhs));
    }

    /* Scalar tail: idx already equals vectorizedCount here. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] | bVector[idx];
    }
}
337 #endif /* LV_HAVE_AVX2 */
338 
339 
340 #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32i_x2_or_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:190
static void volk_32i_x2_or_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:222
static void volk_32i_x2_or_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:153