58 #ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59 #define INCLUDED_volk_32f_x2_subtract_32f_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using AVX-512F with aligned loads and stores.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 *
 * The aligned intrinsics (_mm512_load_ps / _mm512_store_ps) require all
 * three buffers to be 64-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step to the next group of 16 floats; without this the loop would
         * keep reprocessing the first group. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the num_points % 16 leftover elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using AVX with aligned loads and stores.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 *
 * The aligned intrinsics (_mm256_load_ps / _mm256_store_ps) require all
 * three buffers to be 32-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step to the next group of 8 floats. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 leftover elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using SSE with aligned loads and stores.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 *
 * The aligned intrinsics (_mm_load_ps / _mm_store_ps) require all three
 * buffers to be 16-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of 4 floats. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 leftover elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable scalar element-wise subtraction,
 *        cVector[i] = aVector[i] - bVector[i].
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process; 0 leaves cVector untouched.
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using ARM NEON, four floats per iteration.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4_t a_vec, b_vec, c_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vsubq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);

        /* Step to the next group of 4 floats. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 leftover elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC

/* ORC implementation of the kernel; defined outside this header. */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all arguments unchanged to
 *        volk_32f_x2_subtract_32f_a_orc_impl.
 *
 * \param cVector    Output buffer, passed through to the ORC implementation.
 * \param aVector    First input buffer, passed through.
 * \param bVector    Second input buffer, passed through.
 * \param num_points Element count, passed through.
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
247 #ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
248 #define INCLUDED_volk_32f_x2_subtract_32f_u_H
250 #include <inttypes.h>
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using AVX-512F with unaligned loads and stores.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 *
 * Uses _mm512_loadu_ps / _mm512_storeu_ps, so the buffers need no
 * particular alignment.
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step to the next group of 16 floats. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the num_points % 16 leftover elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction, cVector[i] = aVector[i] - bVector[i],
 *        using AVX with unaligned loads and stores.
 *
 * \param cVector    Output buffer; receives num_points floats.
 * \param aVector    Minuend input buffer (num_points floats).
 * \param bVector    Subtrahend input buffer (num_points floats).
 * \param num_points Number of floats to process.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so the buffers need no
 * particular alignment.
 */
static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next group of 8 floats. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 leftover elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_subtract_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:198
static void volk_32f_x2_subtract_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:104
static void volk_32f_x2_subtract_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:294
static void volk_32f_x2_subtract_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:141
static void volk_32f_x2_subtract_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:178