58 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
59 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise product of two float buffers, SSE, unaligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Four floats are handled per vector iteration; the
 * remaining num_points % 4 elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bVal = _mm_loadu_ps(bPtr);

        cVal = _mm_mul_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal);

        /* Step past the four floats just processed so the scalar tail
         * below starts at the first unprocessed element. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the leftover num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise product of two float buffers, AVX-512F, unaligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Sixteen floats are handled per vector iteration; the
 * remaining num_points % 16 elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_mul_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Without this advance every iteration would reprocess the first
         * sixteen floats and the scalar tail would start at offset 0. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the leftover num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise product of two float buffers, AVX, unaligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Eight floats are handled per vector iteration; the
 * remaining num_points % 8 elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_mul_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step past the eight floats just processed so the scalar tail
         * below starts at the first unprocessed element. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the leftover num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise product of two float buffers, portable C.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Nothing is written when num_points is 0.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
198 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
199 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
201 #include <inttypes.h>
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise product of two float buffers, SSE, aligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Four floats are handled per vector iteration; the
 * remaining num_points % 4 elements are handled by a scalar loop.
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require
 * 16-byte aligned addresses, so all three buffers must be 16-byte aligned.
 *
 * \param cVector    Output buffer (16-byte aligned).
 * \param aVector    First input buffer (16-byte aligned).
 * \param bVector    Second input buffer (16-byte aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_mul_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step past the four floats just processed so the scalar tail
         * below starts at the first unprocessed element. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the leftover num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise product of two float buffers, AVX-512F, aligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Sixteen floats are handled per vector iteration; the
 * remaining num_points % 16 elements are handled by a scalar loop.
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, which require
 * 64-byte aligned addresses, so all three buffers must be 64-byte aligned.
 *
 * \param cVector    Output buffer (64-byte aligned).
 * \param aVector    First input buffer (64-byte aligned).
 * \param bVector    Second input buffer (64-byte aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_mul_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Without this advance every iteration would reprocess the first
         * sixteen floats and the scalar tail would start at offset 0. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the leftover num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise product of two float buffers, AVX, aligned access.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Eight floats are handled per vector iteration; the
 * remaining num_points % 8 elements are handled by a scalar loop.
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, which require
 * 32-byte aligned addresses, so all three buffers must be 32-byte aligned.
 *
 * \param cVector    Output buffer (32-byte aligned).
 * \param aVector    First input buffer (32-byte aligned).
 * \param bVector    Second input buffer (32-byte aligned).
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_mul_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step past the eight floats just processed so the scalar tail
         * below starts at the first unprocessed element. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the leftover num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise product of two float buffers, ARM NEON.
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). Four floats are handled per vector iteration; the
 * remaining num_points % 4 elements are handled by a scalar loop.
 * The parameter pointers themselves are advanced as the cursors.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;
    float32x4_t avec, bvec, cvec;

    for (number = 0; number < quarter_points; ++number) {
        avec = vld1q_f32(aVector);
        bvec = vld1q_f32(bVector);
        cvec = vmulq_f32(avec, bvec);
        vst1q_f32(cVector, cvec);

        /* Step past the four floats just processed so the scalar tail
         * below starts at the first unprocessed element. */
        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* Scalar tail: the leftover num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; ++number) {
        *cVector++ = *aVector++ * *bVector++;
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise product of two float buffers, portable C
 *        (entry in the aligned-kernel section of this header).
 *
 * Writes aVector[i] * bVector[i] to cVector[i] for every i in
 * [0, num_points). The body uses plain scalar accesses only. Nothing is
 * written when num_points is 0.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
/* NOTE(review): guard name reconstructed by analogy with the other
 * LV_HAVE_* sections of this header - confirm against the build system. */
#ifdef LV_HAVE_ORC

/* Defined outside this header; only declared here so the wrapper below
 * can forward to it. */
extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief Forwards all four arguments, unchanged, to
 *        volk_32f_x2_multiply_32f_a_orc_impl().
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of floats to process in each buffer.
 */
static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_multiply_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:67
static void volk_32f_x2_multiply_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:178
static void volk_32f_x2_multiply_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:346
static void volk_32f_x2_multiply_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:207
static void volk_32f_x2_multiply_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:282
static void volk_32f_x2_multiply_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:141
static void volk_32f_x2_multiply_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:320