Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
59 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_SSE
65 #include <xmmintrin.h>
66 
/*!
 * \brief Element-wise product of two float vectors, SSE, unaligned access.
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * Unaligned load/store intrinsics are used, so none of the pointers needs
 * any particular alignment.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to multiply.
 */
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Main loop: four products per iteration. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 bVal = _mm_loadu_ps(bPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
99 #endif /* LV_HAVE_SSE */
100 
101 #ifdef LV_HAVE_AVX512F
102 #include <immintrin.h>
103 
104 static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
105  const float* aVector,
106  const float* bVector,
107  unsigned int num_points)
108 {
109  unsigned int number = 0;
110  const unsigned int sixteenthPoints = num_points / 16;
111 
112  float* cPtr = cVector;
113  const float* aPtr = aVector;
114  const float* bPtr = bVector;
115 
116  __m512 aVal, bVal, cVal;
117  for (; number < sixteenthPoints; number++) {
118 
119  aVal = _mm512_loadu_ps(aPtr);
120  bVal = _mm512_loadu_ps(bPtr);
121 
122  cVal = _mm512_mul_ps(aVal, bVal);
123 
124  _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
125 
126  aPtr += 16;
127  bPtr += 16;
128  cPtr += 16;
129  }
130 
131  number = sixteenthPoints * 16;
132  for (; number < num_points; number++) {
133  *cPtr++ = (*aPtr++) * (*bPtr++);
134  }
135 }
136 #endif /* LV_HAVE_AVX512F */
137 
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
140 
141 static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
142  const float* aVector,
143  const float* bVector,
144  unsigned int num_points)
145 {
146  unsigned int number = 0;
147  const unsigned int eighthPoints = num_points / 8;
148 
149  float* cPtr = cVector;
150  const float* aPtr = aVector;
151  const float* bPtr = bVector;
152 
153  __m256 aVal, bVal, cVal;
154  for (; number < eighthPoints; number++) {
155 
156  aVal = _mm256_loadu_ps(aPtr);
157  bVal = _mm256_loadu_ps(bPtr);
158 
159  cVal = _mm256_mul_ps(aVal, bVal);
160 
161  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
162 
163  aPtr += 8;
164  bPtr += 8;
165  cPtr += 8;
166  }
167 
168  number = eighthPoints * 8;
169  for (; number < num_points; number++) {
170  *cPtr++ = (*aPtr++) * (*bPtr++);
171  }
172 }
173 #endif /* LV_HAVE_AVX */
174 
175 
176 #ifdef LV_HAVE_GENERIC
177 
/*!
 * \brief Element-wise product of two float vectors, portable scalar version.
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to multiply.
 */
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
192 #endif /* LV_HAVE_GENERIC */
193 
194 
195 #endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
196 
197 
198 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
199 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
200 
201 #include <inttypes.h>
202 #include <stdio.h>
203 
204 #ifdef LV_HAVE_SSE
205 #include <xmmintrin.h>
206 
/*!
 * \brief Element-wise product of two float vectors, SSE, aligned access.
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require their
 * addresses to be 16-byte aligned; all three buffers must therefore be
 * 16-byte aligned.
 *
 * \param cVector    Output buffer (16-byte aligned); num_points floats are written.
 * \param aVector    First input buffer (16-byte aligned); num_points floats are read.
 * \param bVector    Second input buffer (16-byte aligned); num_points floats are read.
 * \param num_points Number of elements to multiply.
 */
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Main loop: four products per iteration. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);
        const __m128 cVal = _mm_mul_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal); /* Store the results back into the C container */

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
239 #endif /* LV_HAVE_SSE */
240 
241 #ifdef LV_HAVE_AVX512F
242 #include <immintrin.h>
243 
244 static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
245  const float* aVector,
246  const float* bVector,
247  unsigned int num_points)
248 {
249  unsigned int number = 0;
250  const unsigned int sixteenthPoints = num_points / 16;
251 
252  float* cPtr = cVector;
253  const float* aPtr = aVector;
254  const float* bPtr = bVector;
255 
256  __m512 aVal, bVal, cVal;
257  for (; number < sixteenthPoints; number++) {
258 
259  aVal = _mm512_load_ps(aPtr);
260  bVal = _mm512_load_ps(bPtr);
261 
262  cVal = _mm512_mul_ps(aVal, bVal);
263 
264  _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
265 
266  aPtr += 16;
267  bPtr += 16;
268  cPtr += 16;
269  }
270 
271  number = sixteenthPoints * 16;
272  for (; number < num_points; number++) {
273  *cPtr++ = (*aPtr++) * (*bPtr++);
274  }
275 }
276 #endif /* LV_HAVE_AVX512F */
277 
278 
279 #ifdef LV_HAVE_AVX
280 #include <immintrin.h>
281 
282 static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
283  const float* aVector,
284  const float* bVector,
285  unsigned int num_points)
286 {
287  unsigned int number = 0;
288  const unsigned int eighthPoints = num_points / 8;
289 
290  float* cPtr = cVector;
291  const float* aPtr = aVector;
292  const float* bPtr = bVector;
293 
294  __m256 aVal, bVal, cVal;
295  for (; number < eighthPoints; number++) {
296 
297  aVal = _mm256_load_ps(aPtr);
298  bVal = _mm256_load_ps(bPtr);
299 
300  cVal = _mm256_mul_ps(aVal, bVal);
301 
302  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
303 
304  aPtr += 8;
305  bPtr += 8;
306  cPtr += 8;
307  }
308 
309  number = eighthPoints * 8;
310  for (; number < num_points; number++) {
311  *cPtr++ = (*aPtr++) * (*bPtr++);
312  }
313 }
314 #endif /* LV_HAVE_AVX */
315 
316 
317 #ifdef LV_HAVE_NEON
318 #include <arm_neon.h>
319 
/*!
 * \brief Element-wise product of two float vectors using NEON intrinsics.
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * The pointer parameters are advanced in place as the buffers are
 * consumed; being passed by value, this is invisible to the caller.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to multiply.
 */
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    /* Main loop: four products per iteration. */
    for (number = 0; number < quarter_points; ++number) {
        const float32x4_t avec = vld1q_f32(aVector);
        const float32x4_t bvec = vld1q_f32(bVector);
        const float32x4_t cvec = vmulq_f32(avec, bvec);
        vst1q_f32(cVector, cvec);
        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* Scalar tail: the 0..3 elements that do not fill a whole vector. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        *cVector++ = *aVector++ * *bVector++;
    }
}
341 #endif /* LV_HAVE_NEON */
342 
343 
344 #ifdef LV_HAVE_GENERIC
345 
/*!
 * \brief Element-wise product of two float vectors, portable scalar version
 *        exposed under the aligned ("_a_") name.
 *
 * Computes cVector[i] = aVector[i] * bVector[i] for i in [0, num_points).
 * The body performs only scalar float accesses, so it imposes no alignment
 * requirement of its own beyond that of float.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to multiply.
 */
static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
360 #endif /* LV_HAVE_GENERIC */
361 
362 
363 #ifdef LV_HAVE_ORC
/*
 * Defined outside this header; only the declaration is visible here.
 * NOTE(review): presumably generated from an ORC program -- confirm against
 * the build system.
 */
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all arguments unchanged to
 *        volk_32f_x2_multiply_32f_a_orc_impl().
 *
 * \param cVector    Output buffer, passed through to the implementation.
 * \param aVector    First input buffer, passed through to the implementation.
 * \param bVector    Second input buffer, passed through to the implementation.
 * \param num_points Element count, passed through to the implementation.
 */
static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
376 #endif /* LV_HAVE_ORC */
377 
378 
379 #endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_multiply_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:67
static void volk_32f_x2_multiply_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:178
static void volk_32f_x2_multiply_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:346
static void volk_32f_x2_multiply_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:207
static void volk_32f_x2_multiply_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:282
static void volk_32f_x2_multiply_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:141
static void volk_32f_x2_multiply_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:320