Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_32f_fm_detect_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
45 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
46 
47 #include <inttypes.h>
48 #include <stdio.h>
49 
50 #ifdef LV_HAVE_AVX
51 #include <immintrin.h>
52 
/*
 * AVX implementation for aligned buffers.
 *
 * Writes the first difference of the input to the output:
 *   out[i] = in[i] - in[i - 1], with in[-1] taken from *saveValue.
 * A difference greater than +bound has 2 * bound subtracted; a difference
 * smaller than -bound has 2 * bound added. Only one correction is applied.
 *
 * outputVector  receives num_points results; written with _mm256_store_ps,
 *               so it must be 32-byte aligned
 * inputVector   num_points samples; read with _mm256_load_ps, so it must be
 *               32-byte aligned
 * bound         wrap limit
 * saveValue     in: the sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; when 0 nothing is read or written
 *
 * NOTE(review): the vector loop tests both limits against the raw difference,
 * the scalar code tests the lower limit after the upper correction. The two
 * agree for bound >= 0; confirm callers never pass a negative bound.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }
    unsigned int number = 1;
    unsigned int j = 0;
    // (num_points - 1) rather than num_points is inherited: the original note
    // says plain num_points made Fedora 7's gcc crash. It also means the
    // scalar tail below always handles at least one sample when num_points > 8.
    const unsigned int eighthPoints = (num_points - 1) / 8;

    float* outPtr = outputVector;
    const float* inPtr = inputVector;
    const __m256 upperBound = _mm256_set1_ps(bound);
    const __m256 lowerBound = _mm256_set1_ps(-bound);
    const __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Added when above.
    const __m256 negBoundAdjust = _mm256_set1_ps(2 * bound);  // Added when below.

    // The very first sample is differenced against the carried-over saveValue.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound)
        *outPtr -= 2 * bound;
    if (*outPtr < -bound)
        *outPtr += 2 * bound;
    inPtr++;
    outPtr++;
    // Finish the first group of (up to) 8 samples in scalar code.
    for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Groups 1 .. eighthPoints - 1, eight samples per iteration.
    for (; number < eighthPoints; number++) {
        // in[i - 1 .. i + 6]: one float before the aligned position.
        const __m256 previous = _mm256_loadu_ps(inPtr - 1);
        // in[i .. i + 7]
        const __m256 current = _mm256_load_ps(inPtr);
        inPtr += 8;

        __m256 diff = _mm256_sub_ps(current, previous);

        // The comparison masks select the correction per lane (0 elsewhere).
        const __m256 tooHigh =
            _mm256_and_ps(_mm256_cmp_ps(diff, upperBound, _CMP_GT_OS), posBoundAdjust);
        const __m256 tooLow =
            _mm256_and_ps(_mm256_cmp_ps(diff, lowerBound, _CMP_LT_OS), negBoundAdjust);

        diff = _mm256_add_ps(diff, _mm256_or_ps(tooLow, tooHigh));
        _mm256_store_ps(outPtr, diff);
        outPtr += 8;
    }

    // Scalar tail: everything after the last full vector group.
    for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
         number++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
127 #endif /* LV_HAVE_AVX */
128 
129 
130 #ifdef LV_HAVE_SSE
131 #include <xmmintrin.h>
132 
/*
 * SSE implementation for aligned buffers.
 *
 * Writes the first difference of the input to the output:
 *   out[i] = in[i] - in[i - 1], with in[-1] taken from *saveValue.
 * A difference greater than +bound has 2 * bound subtracted; a difference
 * smaller than -bound has 2 * bound added. Only one correction is applied.
 *
 * outputVector  receives num_points results; written with _mm_store_ps,
 *               so it must be 16-byte aligned
 * inputVector   num_points samples; read with _mm_load_ps, so it must be
 *               16-byte aligned
 * bound         wrap limit
 * saveValue     in: the sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; when 0 nothing is read or written
 *
 * NOTE(review): the vector loop tests both limits against the raw difference,
 * the scalar code tests the lower limit after the upper correction. The two
 * agree for bound >= 0; confirm callers never pass a negative bound.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }
    unsigned int number = 1;
    unsigned int j = 0;
    // (num_points - 1) rather than num_points is inherited: the original note
    // says plain num_points made Fedora 7's gcc crash. It also means the
    // scalar tail below always handles at least one sample when num_points > 4.
    const unsigned int quarterPoints = (num_points - 1) / 4;

    float* outPtr = outputVector;
    const float* inPtr = inputVector;
    const __m128 upperBound = _mm_set_ps1(bound);
    const __m128 lowerBound = _mm_set_ps1(-bound);
    const __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Added when above.
    const __m128 negBoundAdjust = _mm_set_ps1(2 * bound);  // Added when below.

    // The very first sample is differenced against the carried-over saveValue.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound)
        *outPtr -= 2 * bound;
    if (*outPtr < -bound)
        *outPtr += 2 * bound;
    inPtr++;
    outPtr++;
    // Finish the first group of (up to) 4 samples in scalar code.
    for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Groups 1 .. quarterPoints - 1, four samples per iteration.
    for (; number < quarterPoints; number++) {
        // in[i - 1 .. i + 2]: one float before the aligned position.
        const __m128 previous = _mm_loadu_ps(inPtr - 1);
        // in[i .. i + 3]
        const __m128 current = _mm_load_ps(inPtr);
        inPtr += 4;

        __m128 diff = _mm_sub_ps(current, previous);

        // The comparison masks select the correction per lane (0 elsewhere).
        const __m128 tooHigh = _mm_and_ps(_mm_cmpgt_ps(diff, upperBound), posBoundAdjust);
        const __m128 tooLow = _mm_and_ps(_mm_cmplt_ps(diff, lowerBound), negBoundAdjust);

        diff = _mm_add_ps(diff, _mm_or_ps(tooLow, tooHigh));
        _mm_store_ps(outPtr, diff);
        outPtr += 4;
    }

    // Scalar tail: everything after the last full vector group.
    for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
         number < num_points;
         number++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
208 #endif /* LV_HAVE_SSE */
209 
210 #ifdef LV_HAVE_GENERIC
211 
/*
 * Portable scalar implementation.
 *
 * Writes the first difference of the input to the output:
 *   out[i] = in[i] - in[i - 1], with in[-1] taken from *saveValue.
 * A difference greater than +bound has 2 * bound subtracted; a difference
 * smaller than -bound has 2 * bound added. Only one correction is applied,
 * so a difference further than 3 * bound from zero is left outside
 * [-bound, bound].
 *
 * outputVector  receives num_points results
 * inputVector   num_points samples
 * bound         wrap limit
 * saveValue     in: the sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; when 0 nothing is read or written
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
                                                           const float* inputVector,
                                                           const float bound,
                                                           float* saveValue,
                                                           unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }
    unsigned int number = 0;
    float* outPtr = outputVector;
    const float* inPtr = inputVector;

    // The very first sample is differenced against the carried-over saveValue.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound)
        *outPtr -= 2 * bound;
    if (*outPtr < -bound)
        *outPtr += 2 * bound;
    inPtr++;
    outPtr++;

    // Every later sample is differenced against its predecessor in the buffer.
    for (number = 1; number < num_points; number++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
246 #endif /* LV_HAVE_GENERIC */
247 
248 
249 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
250 
251 
252 #ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
253 #define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
254 
255 #include <inttypes.h>
256 #include <stdio.h>
257 
258 #ifdef LV_HAVE_AVX
259 #include <immintrin.h>
260 
/*
 * AVX implementation for unaligned buffers.
 *
 * Writes the first difference of the input to the output:
 *   out[i] = in[i] - in[i - 1], with in[-1] taken from *saveValue.
 * A difference greater than +bound has 2 * bound subtracted; a difference
 * smaller than -bound has 2 * bound added. Only one correction is applied.
 *
 * outputVector  receives num_points results (stored with _mm256_storeu_ps,
 *               so no alignment is required)
 * inputVector   num_points samples (read with _mm256_loadu_ps, so no
 *               alignment is required)
 * bound         wrap limit
 * saveValue     in: the sample preceding inputVector[0];
 *               out: inputVector[num_points - 1]
 * num_points    number of samples; when 0 nothing is read or written
 *
 * NOTE(review): the vector loop tests both limits against the raw difference,
 * the scalar code tests the lower limit after the upper correction. The two
 * agree for bound >= 0; confirm callers never pass a negative bound.
 */
static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
                                                         const float* inputVector,
                                                         const float bound,
                                                         float* saveValue,
                                                         unsigned int num_points)
{
    if (num_points < 1) {
        return;
    }
    unsigned int number = 1;
    unsigned int j = 0;
    // (num_points - 1) rather than num_points is inherited: the original note
    // says plain num_points made Fedora 7's gcc crash. It also means the
    // scalar tail below always handles at least one sample when num_points > 8.
    const unsigned int eighthPoints = (num_points - 1) / 8;

    float* outPtr = outputVector;
    const float* inPtr = inputVector;
    const __m256 upperBound = _mm256_set1_ps(bound);
    const __m256 lowerBound = _mm256_set1_ps(-bound);
    const __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Added when above.
    const __m256 negBoundAdjust = _mm256_set1_ps(2 * bound);  // Added when below.

    // The very first sample is differenced against the carried-over saveValue.
    *outPtr = *inPtr - *saveValue;
    if (*outPtr > bound)
        *outPtr -= 2 * bound;
    if (*outPtr < -bound)
        *outPtr += 2 * bound;
    inPtr++;
    outPtr++;
    // Finish the first group of (up to) 8 samples in scalar code.
    for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Groups 1 .. eighthPoints - 1, eight samples per iteration.
    for (; number < eighthPoints; number++) {
        // in[i - 1 .. i + 6]
        const __m256 previous = _mm256_loadu_ps(inPtr - 1);
        // in[i .. i + 7]
        const __m256 current = _mm256_loadu_ps(inPtr);
        inPtr += 8;

        __m256 diff = _mm256_sub_ps(current, previous);

        // The comparison masks select the correction per lane (0 elsewhere).
        const __m256 tooHigh =
            _mm256_and_ps(_mm256_cmp_ps(diff, upperBound, _CMP_GT_OS), posBoundAdjust);
        const __m256 tooLow =
            _mm256_and_ps(_mm256_cmp_ps(diff, lowerBound, _CMP_LT_OS), negBoundAdjust);

        diff = _mm256_add_ps(diff, _mm256_or_ps(tooLow, tooHigh));
        _mm256_storeu_ps(outPtr, diff);
        outPtr += 8;
    }

    // Scalar tail: everything after the last full vector group.
    for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
         number++) {
        *outPtr = *(inPtr) - *(inPtr - 1);
        if (*outPtr > bound)
            *outPtr -= 2 * bound;
        if (*outPtr < -bound)
            *outPtr += 2 * bound;
        inPtr++;
        outPtr++;
    }

    // Carry the last input sample over to the next call.
    *saveValue = inputVector[num_points - 1];
}
335 #endif /* LV_HAVE_AVX */
336 
337 
338 #endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_32f_fm_detect_32f_a_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:53
static void volk_32f_s32f_32f_fm_detect_32f_u_avx(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:261
static void volk_32f_s32f_32f_fm_detect_32f_a_sse(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:133
static void volk_32f_s32f_32f_fm_detect_32f_generic(float *outputVector, const float *inputVector, const float bound, float *saveValue, unsigned int num_points)
Definition: volk_32f_s32f_32f_fm_detect_32f.h:212