Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_x2_subtract_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59 #define INCLUDED_volk_32f_x2_subtract_32f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
66 
67 static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
68  const float* aVector,
69  const float* bVector,
70  unsigned int num_points)
71 {
72  unsigned int number = 0;
73  const unsigned int sixteenthPoints = num_points / 16;
74 
75  float* cPtr = cVector;
76  const float* aPtr = aVector;
77  const float* bPtr = bVector;
78 
79  __m512 aVal, bVal, cVal;
80  for (; number < sixteenthPoints; number++) {
81 
82  aVal = _mm512_load_ps(aPtr);
83  bVal = _mm512_load_ps(bPtr);
84 
85  cVal = _mm512_sub_ps(aVal, bVal);
86 
87  _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
88 
89  aPtr += 16;
90  bPtr += 16;
91  cPtr += 16;
92  }
93 
94  number = sixteenthPoints * 16;
95  for (; number < num_points; number++) {
96  *cPtr++ = (*aPtr++) - (*bPtr++);
97  }
98 }
99 #endif /* LV_HAVE_AVX512F */
100 
101 #ifdef LV_HAVE_AVX
102 #include <immintrin.h>
103 
104 static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
105  const float* aVector,
106  const float* bVector,
107  unsigned int num_points)
108 {
109  unsigned int number = 0;
110  const unsigned int eighthPoints = num_points / 8;
111 
112  float* cPtr = cVector;
113  const float* aPtr = aVector;
114  const float* bPtr = bVector;
115 
116  __m256 aVal, bVal, cVal;
117  for (; number < eighthPoints; number++) {
118 
119  aVal = _mm256_load_ps(aPtr);
120  bVal = _mm256_load_ps(bPtr);
121 
122  cVal = _mm256_sub_ps(aVal, bVal);
123 
124  _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
125 
126  aPtr += 8;
127  bPtr += 8;
128  cPtr += 8;
129  }
130 
131  number = eighthPoints * 8;
132  for (; number < num_points; number++) {
133  *cPtr++ = (*aPtr++) - (*bPtr++);
134  }
135 }
136 #endif /* LV_HAVE_AVX */
137 
138 #ifdef LV_HAVE_SSE
139 #include <xmmintrin.h>
140 
/*!
 * \brief Element-wise float subtraction, cVector[i] = aVector[i] - bVector[i],
 *        processing 4 floats per iteration with SSE.
 *
 * Uses the aligned load/store intrinsics (_mm_load_ps / _mm_store_ps);
 * Intel documents these as requiring 16-byte-aligned addresses, so all three
 * buffers are expected to satisfy that.
 *
 * \param cVector    destination buffer, receives num_points results
 * \param aVector    minuend buffer
 * \param bVector    subtrahend buffer
 * \param num_points number of floats to process in each buffer
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Count of leading elements that fit into whole 4-float vectors. */
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < vectorizedPoints; idx += 4) {
        const __m128 minuend = _mm_load_ps(aVector + idx);
        const __m128 subtrahend = _mm_load_ps(bVector + idx);

        _mm_store_ps(cVector + idx, _mm_sub_ps(minuend, subtrahend));
    }

    /* Scalar pass over the remaining (num_points % 4) elements. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] - bVector[idx];
    }
}
173 #endif /* LV_HAVE_SSE */
174 
175 
176 #ifdef LV_HAVE_GENERIC
177 
/*!
 * \brief Portable scalar element-wise float subtraction:
 *        cVector[i] = aVector[i] - bVector[i] for i in [0, num_points).
 *
 * \param cVector    destination buffer, receives num_points results
 * \param aVector    minuend buffer
 * \param bVector    subtrahend buffer
 * \param num_points number of floats to process in each buffer
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] - bVector[idx];
    }
}
192 #endif /* LV_HAVE_GENERIC */
193 
194 
195 #ifdef LV_HAVE_NEON
196 #include <arm_neon.h>
197 
/*!
 * \brief Element-wise float subtraction, cVector[i] = aVector[i] - bVector[i],
 *        processing 4 floats per iteration with NEON.
 *
 * \param cVector    destination buffer, receives num_points results
 * \param aVector    minuend buffer
 * \param bVector    subtrahend buffer
 * \param num_points number of floats to process in each buffer
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Count of leading elements that fit into whole 4-float vectors. */
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < vectorizedPoints; idx += 4) {
        const float32x4_t minuend = vld1q_f32(aVector + idx);
        const float32x4_t subtrahend = vld1q_f32(bVector + idx);

        vst1q_f32(cVector + idx, vsubq_f32(minuend, subtrahend));
    }

    /* Scalar pass over the remaining (num_points % 4) elements. */
    for (; idx < num_points; idx++) {
        cVector[idx] = aVector[idx] - bVector[idx];
    }
}
225 #endif /* LV_HAVE_NEON */
226 
227 
228 #ifdef LV_HAVE_ORC
/*
 * ORC-backed subtraction routine. Only the declaration is visible in this
 * header; the definition is supplied elsewhere at link time.
 */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all four arguments, unchanged, to
 *        volk_32f_x2_subtract_32f_a_orc_impl().
 *
 * \param cVector    destination buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of floats to process in each buffer
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
241 #endif /* LV_HAVE_ORC */
242 
243 
244 #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
245 
246 
247 #ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
248 #define INCLUDED_volk_32f_x2_subtract_32f_u_H
249 
250 #include <inttypes.h>
251 #include <stdio.h>
252 
253 #ifdef LV_HAVE_AVX512F
254 #include <immintrin.h>
255 
256 static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
257  const float* aVector,
258  const float* bVector,
259  unsigned int num_points)
260 {
261  unsigned int number = 0;
262  const unsigned int sixteenthPoints = num_points / 16;
263 
264  float* cPtr = cVector;
265  const float* aPtr = aVector;
266  const float* bPtr = bVector;
267 
268  __m512 aVal, bVal, cVal;
269  for (; number < sixteenthPoints; number++) {
270 
271  aVal = _mm512_loadu_ps(aPtr);
272  bVal = _mm512_loadu_ps(bPtr);
273 
274  cVal = _mm512_sub_ps(aVal, bVal);
275 
276  _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
277 
278  aPtr += 16;
279  bPtr += 16;
280  cPtr += 16;
281  }
282 
283  number = sixteenthPoints * 16;
284  for (; number < num_points; number++) {
285  *cPtr++ = (*aPtr++) - (*bPtr++);
286  }
287 }
288 #endif /* LV_HAVE_AVX512F */
289 
290 
291 #ifdef LV_HAVE_AVX
292 #include <immintrin.h>
293 
294 static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
295  const float* aVector,
296  const float* bVector,
297  unsigned int num_points)
298 {
299  unsigned int number = 0;
300  const unsigned int eighthPoints = num_points / 8;
301 
302  float* cPtr = cVector;
303  const float* aPtr = aVector;
304  const float* bPtr = bVector;
305 
306  __m256 aVal, bVal, cVal;
307  for (; number < eighthPoints; number++) {
308 
309  aVal = _mm256_loadu_ps(aPtr);
310  bVal = _mm256_loadu_ps(bPtr);
311 
312  cVal = _mm256_sub_ps(aVal, bVal);
313 
314  _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
315 
316  aPtr += 8;
317  bPtr += 8;
318  cPtr += 8;
319  }
320 
321  number = eighthPoints * 8;
322  for (; number < num_points; number++) {
323  *cPtr++ = (*aPtr++) - (*bPtr++);
324  }
325 }
326 #endif /* LV_HAVE_AVX */
327 
328 #endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32f_x2_subtract_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:198
static void volk_32f_x2_subtract_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:104
static void volk_32f_x2_subtract_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:294
static void volk_32f_x2_subtract_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:141
static void volk_32f_x2_subtract_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:178