Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
56 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58 
59 #include <inttypes.h>
60 #include <stdio.h>
61 
62 #ifdef LV_HAVE_SSE
63 #include <xmmintrin.h>
64 
/*
 * Scale a float buffer by a constant using SSE with unaligned memory access.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of four that does not exceed num_points. */
    const unsigned int simdEnd = (num_points / 4) * 4;
    /* The scalar replicated into all four lanes of a 128-bit register. */
    const __m128 factor = _mm_set_ps1(scalar);
    unsigned int idx;

    /* Main pass: four floats per iteration via unaligned load/store. */
    for (idx = 0; idx < simdEnd; idx += 4) {
        const __m128 samples = _mm_loadu_ps(aVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_mul_ps(samples, factor));
    }

    /* Remainder pass: the final 0-3 samples are handled one at a time. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
94 #endif /* LV_HAVE_SSE */
95 
96 #ifdef LV_HAVE_AVX
97 #include <immintrin.h>
98 
/*
 * Scale a float buffer by a constant using AVX with unaligned memory access.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The scalar replicated into all eight lanes of a 256-bit register. */
    const __m256 factor = _mm256_set1_ps(scalar);
    const float* src = aVector;
    float* dst = cVector;
    unsigned int blocksLeft = num_points / 8; /* full 8-float groups */
    unsigned int tailLeft = num_points % 8;   /* samples after the last group */

    /* Main pass: eight floats per iteration via unaligned load/store. */
    while (blocksLeft > 0) {
        const __m256 samples = _mm256_loadu_ps(src);
        _mm256_storeu_ps(dst, _mm256_mul_ps(samples, factor));
        src += 8;
        dst += 8;
        blocksLeft--;
    }

    /* Remainder pass: the final 0-7 samples are handled one at a time. */
    while (tailLeft > 0) {
        *dst = (*src) * scalar;
        dst++;
        src++;
        tailLeft--;
    }
}
129 #endif /* LV_HAVE_AVX */
130 
131 #ifdef LV_HAVE_GENERIC
132 
/*
 * Portable reference implementation: multiplies every input float by a
 * constant, one element at a time.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
147 #endif /* LV_HAVE_GENERIC */
148 
149 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
150 
151 
152 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
153 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
154 
155 #include <inttypes.h>
156 #include <stdio.h>
157 
158 #ifdef LV_HAVE_SSE
159 #include <xmmintrin.h>
160 
/*
 * Scale a float buffer by a constant using SSE with aligned memory access.
 *
 * The bulk loop uses _mm_load_ps/_mm_store_ps, so aVector and cVector must
 * be 16-byte aligned whenever num_points >= 4.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The scalar replicated into all four lanes of a 128-bit register. */
    const __m128 factor = _mm_set_ps1(scalar);
    const float* src = aVector;
    float* dst = cVector;
    unsigned int blocksLeft = num_points / 4; /* full 4-float groups */
    unsigned int tailLeft = num_points % 4;   /* samples after the last group */

    /* Main pass: four floats per iteration via aligned load/store. */
    while (blocksLeft > 0) {
        const __m128 samples = _mm_load_ps(src);
        _mm_store_ps(dst, _mm_mul_ps(samples, factor));
        src += 4;
        dst += 4;
        blocksLeft--;
    }

    /* Remainder pass: the final 0-3 samples are handled one at a time. */
    while (tailLeft > 0) {
        *dst = (*src) * scalar;
        dst++;
        src++;
        tailLeft--;
    }
}
190 #endif /* LV_HAVE_SSE */
191 
192 #ifdef LV_HAVE_AVX
193 #include <immintrin.h>
194 
/*
 * Scale a float buffer by a constant using AVX with aligned memory access.
 *
 * The bulk loop uses _mm256_load_ps/_mm256_store_ps, so aVector and cVector
 * must be 32-byte aligned whenever num_points >= 8.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of eight that does not exceed num_points. */
    const unsigned int simdEnd = (num_points / 8) * 8;
    /* The scalar replicated into all eight lanes of a 256-bit register. */
    const __m256 factor = _mm256_set1_ps(scalar);
    unsigned int idx;

    /* Main pass: eight floats per iteration via aligned load/store. */
    for (idx = 0; idx < simdEnd; idx += 8) {
        const __m256 samples = _mm256_load_ps(aVector + idx);
        _mm256_store_ps(cVector + idx, _mm256_mul_ps(samples, factor));
    }

    /* Remainder pass: the final 0-7 samples are handled one at a time. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
224 #endif /* LV_HAVE_AVX */
225 
226 #ifdef LV_HAVE_NEON
227 #include <arm_neon.h>
228 
/*
 * Scale a float buffer by a constant using NEON intrinsics.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* Largest multiple of four that does not exceed num_points. */
    const unsigned int simdEnd = (num_points / 4) * 4;
    unsigned int idx;

    /* Main pass: load four floats, multiply each lane by scalar, store. */
    for (idx = 0; idx < simdEnd; idx += 4) {
        const float32x4_t samples = vld1q_f32(aVector + idx);
        vst1q_f32(cVector + idx, vmulq_n_f32(samples, scalar));
    }

    /* Remainder pass: the final 0-3 samples are handled one at a time. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
252 #endif /* LV_HAVE_NEON */
253 
254 
255 #ifdef LV_HAVE_GENERIC
256 
/*
 * Portable reference implementation (aligned-protokernel variant):
 * multiplies every input float by a constant, one element at a time.
 * No alignment-dependent operations are used in this body.
 *
 * cVector:    output buffer; receives num_points products.
 * aVector:    input buffer; num_points floats are read from it.
 * scalar:     factor applied to every input sample.
 * num_points: number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
                                                        const float* aVector,
                                                        const float scalar,
                                                        unsigned int num_points)
{
    const float* src = aVector;
    float* dst = cVector;
    unsigned int remaining = num_points;

    /* Count down instead of up; each pass consumes exactly one sample. */
    while (remaining != 0) {
        *dst++ = (*src++) * scalar;
        remaining--;
    }
}
271 #endif /* LV_HAVE_GENERIC */
272 
273 
#ifdef LV_HAVE_ORC

/*
 * Multiply kernel defined outside this header; only the prototype is visible
 * here, so its definition must be supplied at link time.
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to
 * volk_32f_s32f_multiply_32f_a_orc_impl.
 *
 * NOTE(review): this wrapper is named "_u_" but calls an "_a_" implementation;
 * presumably the external kernel has no alignment requirement - confirm
 * against its definition.
 *
 * cVector:    output buffer, passed through as dst.
 * aVector:    input buffer, passed through as src.
 * scalar:     multiplication factor, passed through unchanged.
 * num_points: element count, passed through unchanged.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}

#endif /* LV_HAVE_ORC */
290 
291 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_s32f_multiply_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:195
static void volk_32f_s32f_multiply_32f_a_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:257
static void volk_32f_s32f_multiply_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:65
static void volk_32f_s32f_multiply_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:99
static void volk_32f_s32f_multiply_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:161
static void volk_32f_s32f_multiply_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:133
static void volk_32f_s32f_multiply_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:229