Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62 
63 #ifndef INCLUDED_volk_32f_sin_32f_a_H
64 #define INCLUDED_volk_32f_sin_32f_a_H
65 #ifdef LV_HAVE_AVX512F
66 
67 #include <immintrin.h>
68 static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
69  const float* inVector,
70  unsigned int num_points)
71 {
72  float* sinPtr = sinVector;
73  const float* inPtr = inVector;
74 
75  unsigned int number = 0;
76  unsigned int sixteenPoints = num_points / 16;
77  unsigned int i = 0;
78 
79  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
80  fones;
81  __m512 sine, cosine;
82  __m512i q, zeros, ones, twos, fours;
83 
84  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88  ffours = _mm512_set1_ps(4.0);
89  ftwos = _mm512_set1_ps(2.0);
90  fones = _mm512_set1_ps(1.0);
91  zeros = _mm512_setzero_epi32();
92  ones = _mm512_set1_epi32(1);
93  twos = _mm512_set1_epi32(2);
94  fours = _mm512_set1_epi32(4);
95 
96  cp1 = _mm512_set1_ps(1.0);
97  cp2 = _mm512_set1_ps(0.08333333333333333);
98  cp3 = _mm512_set1_ps(0.002777777777777778);
99  cp4 = _mm512_set1_ps(4.96031746031746e-05);
100  cp5 = _mm512_set1_ps(5.511463844797178e-07);
101  __mmask16 condition1, condition2, ltZero;
102 
103  for (; number < sixteenPoints; number++) {
104  aVal = _mm512_load_ps(inPtr);
105  // s = fabs(aVal)
106  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107 
108  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110  // r = q + q&1, q indicates quadrant, r gives
111  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112 
113  s = _mm512_fnmadd_ps(r, pio4A, s);
114  s = _mm512_fnmadd_ps(r, pio4B, s);
115  s = _mm512_fnmadd_ps(r, pio4C, s);
116 
117  s = _mm512_div_ps(
118  s,
119  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120  s = _mm512_mul_ps(s, s);
121  // Evaluate Taylor series
122  s = _mm512_mul_ps(
123  _mm512_fmadd_ps(
124  _mm512_fmsub_ps(
125  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126  s,
127  cp1),
128  s);
129 
130  for (i = 0; i < 3; i++)
131  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132  s = _mm512_div_ps(s, ftwos);
133 
134  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135  cosine = _mm512_sub_ps(fones, s);
136 
137  condition1 = _mm512_cmpneq_epi32_mask(
138  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
139  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
140  condition2 = _mm512_kxor(
141  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
142 
143  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
144  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
145  _mm512_store_ps(sinPtr, sine);
146  inPtr += 16;
147  sinPtr += 16;
148  }
149 
150  number = sixteenPoints * 16;
151  for (; number < num_points; number++) {
152  *sinPtr++ = sinf(*inPtr++);
153  }
154 }
155 #endif
156 #if LV_HAVE_AVX2 && LV_HAVE_FMA
157 #include <immintrin.h>
158 
159 static inline void
160 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
161 {
162  float* bPtr = bVector;
163  const float* aPtr = aVector;
164 
165  unsigned int number = 0;
166  unsigned int eighthPoints = num_points / 8;
167  unsigned int i = 0;
168 
169  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
170  fzeroes;
171  __m256 sine, cosine, condition1, condition2;
172  __m256i q, r, ones, twos, fours;
173 
174  m4pi = _mm256_set1_ps(1.273239545);
175  pio4A = _mm256_set1_ps(0.78515625);
176  pio4B = _mm256_set1_ps(0.241876e-3);
177  ffours = _mm256_set1_ps(4.0);
178  ftwos = _mm256_set1_ps(2.0);
179  fones = _mm256_set1_ps(1.0);
180  fzeroes = _mm256_setzero_ps();
181  ones = _mm256_set1_epi32(1);
182  twos = _mm256_set1_epi32(2);
183  fours = _mm256_set1_epi32(4);
184 
185  cp1 = _mm256_set1_ps(1.0);
186  cp2 = _mm256_set1_ps(0.83333333e-1);
187  cp3 = _mm256_set1_ps(0.2777778e-2);
188  cp4 = _mm256_set1_ps(0.49603e-4);
189  cp5 = _mm256_set1_ps(0.551e-6);
190 
191  for (; number < eighthPoints; number++) {
192  aVal = _mm256_load_ps(aPtr);
193  s = _mm256_sub_ps(aVal,
194  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
195  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
196  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
197  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
198 
199  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
200  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
201 
202  s = _mm256_div_ps(
203  s,
204  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
205  s = _mm256_mul_ps(s, s);
206  // Evaluate Taylor series
207  s = _mm256_mul_ps(
208  _mm256_fmadd_ps(
209  _mm256_fmsub_ps(
210  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
211  s,
212  cp1),
213  s);
214 
215  for (i = 0; i < 3; i++) {
216  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
217  }
218  s = _mm256_div_ps(s, ftwos);
219 
220  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
221  cosine = _mm256_sub_ps(fones, s);
222 
223  condition1 = _mm256_cmp_ps(
224  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
225  fzeroes,
226  _CMP_NEQ_UQ);
227  condition2 = _mm256_cmp_ps(
228  _mm256_cmp_ps(
229  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
230  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
231  _CMP_NEQ_UQ);
232  // Need this condition only for cos
233  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
234  // twos), fours)), fzeroes);
235 
236  sine =
237  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
238  sine = _mm256_sub_ps(
239  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
240  _mm256_store_ps(bPtr, sine);
241  aPtr += 8;
242  bPtr += 8;
243  }
244 
245  number = eighthPoints * 8;
246  for (; number < num_points; number++) {
247  *bPtr++ = sin(*aPtr++);
248  }
249 }
250 
251 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
252 
253 #ifdef LV_HAVE_AVX2
254 #include <immintrin.h>
255 
256 static inline void
257 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
258 {
259  float* bPtr = bVector;
260  const float* aPtr = aVector;
261 
262  unsigned int number = 0;
263  unsigned int eighthPoints = num_points / 8;
264  unsigned int i = 0;
265 
266  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
267  fzeroes;
268  __m256 sine, cosine, condition1, condition2;
269  __m256i q, r, ones, twos, fours;
270 
271  m4pi = _mm256_set1_ps(1.273239545);
272  pio4A = _mm256_set1_ps(0.78515625);
273  pio4B = _mm256_set1_ps(0.241876e-3);
274  ffours = _mm256_set1_ps(4.0);
275  ftwos = _mm256_set1_ps(2.0);
276  fones = _mm256_set1_ps(1.0);
277  fzeroes = _mm256_setzero_ps();
278  ones = _mm256_set1_epi32(1);
279  twos = _mm256_set1_epi32(2);
280  fours = _mm256_set1_epi32(4);
281 
282  cp1 = _mm256_set1_ps(1.0);
283  cp2 = _mm256_set1_ps(0.83333333e-1);
284  cp3 = _mm256_set1_ps(0.2777778e-2);
285  cp4 = _mm256_set1_ps(0.49603e-4);
286  cp5 = _mm256_set1_ps(0.551e-6);
287 
288  for (; number < eighthPoints; number++) {
289  aVal = _mm256_load_ps(aPtr);
290  s = _mm256_sub_ps(aVal,
291  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
292  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
293  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
294  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
295 
296  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
297  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
298 
299  s = _mm256_div_ps(
300  s,
301  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
302  s = _mm256_mul_ps(s, s);
303  // Evaluate Taylor series
304  s = _mm256_mul_ps(
305  _mm256_add_ps(
306  _mm256_mul_ps(
307  _mm256_sub_ps(
308  _mm256_mul_ps(
309  _mm256_add_ps(
310  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
311  s),
312  cp3),
313  s),
314  cp2),
315  s),
316  cp1),
317  s);
318 
319  for (i = 0; i < 3; i++) {
320  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
321  }
322  s = _mm256_div_ps(s, ftwos);
323 
324  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
325  cosine = _mm256_sub_ps(fones, s);
326 
327  condition1 = _mm256_cmp_ps(
328  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
329  fzeroes,
330  _CMP_NEQ_UQ);
331  condition2 = _mm256_cmp_ps(
332  _mm256_cmp_ps(
333  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
334  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
335  _CMP_NEQ_UQ);
336  // Need this condition only for cos
337  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
338  // twos), fours)), fzeroes);
339 
340  sine =
341  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
342  sine = _mm256_sub_ps(
343  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
344  _mm256_store_ps(bPtr, sine);
345  aPtr += 8;
346  bPtr += 8;
347  }
348 
349  number = eighthPoints * 8;
350  for (; number < num_points; number++) {
351  *bPtr++ = sin(*aPtr++);
352  }
353 }
354 
355 #endif /* LV_HAVE_AVX2 for aligned */
356 
357 #ifdef LV_HAVE_SSE4_1
358 #include <smmintrin.h>
359 
360 static inline void
361 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
362 {
363  float* bPtr = bVector;
364  const float* aPtr = aVector;
365 
366  unsigned int number = 0;
367  unsigned int quarterPoints = num_points / 4;
368  unsigned int i = 0;
369 
370  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
371  fzeroes;
372  __m128 sine, cosine, condition1, condition2;
373  __m128i q, r, ones, twos, fours;
374 
375  m4pi = _mm_set1_ps(1.273239545);
376  pio4A = _mm_set1_ps(0.78515625);
377  pio4B = _mm_set1_ps(0.241876e-3);
378  ffours = _mm_set1_ps(4.0);
379  ftwos = _mm_set1_ps(2.0);
380  fones = _mm_set1_ps(1.0);
381  fzeroes = _mm_setzero_ps();
382  ones = _mm_set1_epi32(1);
383  twos = _mm_set1_epi32(2);
384  fours = _mm_set1_epi32(4);
385 
386  cp1 = _mm_set1_ps(1.0);
387  cp2 = _mm_set1_ps(0.83333333e-1);
388  cp3 = _mm_set1_ps(0.2777778e-2);
389  cp4 = _mm_set1_ps(0.49603e-4);
390  cp5 = _mm_set1_ps(0.551e-6);
391 
392  for (; number < quarterPoints; number++) {
393  aVal = _mm_load_ps(aPtr);
394  s = _mm_sub_ps(aVal,
395  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
396  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
397  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
398 
399  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
400  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
401 
402  s = _mm_div_ps(
403  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
404  s = _mm_mul_ps(s, s);
405  // Evaluate Taylor series
406  s = _mm_mul_ps(
407  _mm_add_ps(
408  _mm_mul_ps(
409  _mm_sub_ps(
410  _mm_mul_ps(
411  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
412  cp3),
413  s),
414  cp2),
415  s),
416  cp1),
417  s);
418 
419  for (i = 0; i < 3; i++) {
420  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
421  }
422  s = _mm_div_ps(s, ftwos);
423 
424  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
425  cosine = _mm_sub_ps(fones, s);
426 
427  condition1 = _mm_cmpneq_ps(
428  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
429  condition2 = _mm_cmpneq_ps(
430  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
431  _mm_cmplt_ps(aVal, fzeroes));
432  // Need this condition only for cos
433  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
434  // twos), fours)), fzeroes);
435 
436  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
437  sine =
438  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
439  _mm_store_ps(bPtr, sine);
440  aPtr += 4;
441  bPtr += 4;
442  }
443 
444  number = quarterPoints * 4;
445  for (; number < num_points; number++) {
446  *bPtr++ = sinf(*aPtr++);
447  }
448 }
449 
450 #endif /* LV_HAVE_SSE4_1 for aligned */
451 
452 
453 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
454 
455 #ifndef INCLUDED_volk_32f_sin_32f_u_H
456 #define INCLUDED_volk_32f_sin_32f_u_H
457 
458 #ifdef LV_HAVE_AVX512F
459 
460 #include <immintrin.h>
461 static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
462  const float* inVector,
463  unsigned int num_points)
464 {
465  float* sinPtr = sinVector;
466  const float* inPtr = inVector;
467 
468  unsigned int number = 0;
469  unsigned int sixteenPoints = num_points / 16;
470  unsigned int i = 0;
471 
472  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
473  fones;
474  __m512 sine, cosine;
475  __m512i q, zeros, ones, twos, fours;
476 
477  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
478  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
479  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
480  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
481  ffours = _mm512_set1_ps(4.0);
482  ftwos = _mm512_set1_ps(2.0);
483  fones = _mm512_set1_ps(1.0);
484  zeros = _mm512_setzero_epi32();
485  ones = _mm512_set1_epi32(1);
486  twos = _mm512_set1_epi32(2);
487  fours = _mm512_set1_epi32(4);
488 
489  cp1 = _mm512_set1_ps(1.0);
490  cp2 = _mm512_set1_ps(0.08333333333333333);
491  cp3 = _mm512_set1_ps(0.002777777777777778);
492  cp4 = _mm512_set1_ps(4.96031746031746e-05);
493  cp5 = _mm512_set1_ps(5.511463844797178e-07);
494  __mmask16 condition1, condition2, ltZero;
495 
496  for (; number < sixteenPoints; number++) {
497  aVal = _mm512_loadu_ps(inPtr);
498  // s = fabs(aVal)
499  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
500 
501  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
502  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
503  // r = q + q&1, q indicates quadrant, r gives
504  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
505 
506  s = _mm512_fnmadd_ps(r, pio4A, s);
507  s = _mm512_fnmadd_ps(r, pio4B, s);
508  s = _mm512_fnmadd_ps(r, pio4C, s);
509 
510  s = _mm512_div_ps(
511  s,
512  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
513  s = _mm512_mul_ps(s, s);
514  // Evaluate Taylor series
515  s = _mm512_mul_ps(
516  _mm512_fmadd_ps(
517  _mm512_fmsub_ps(
518  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
519  s,
520  cp1),
521  s);
522 
523  for (i = 0; i < 3; i++)
524  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
525  s = _mm512_div_ps(s, ftwos);
526 
527  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
528  cosine = _mm512_sub_ps(fones, s);
529 
530  condition1 = _mm512_cmpneq_epi32_mask(
531  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
532  ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
533  condition2 = _mm512_kxor(
534  _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
535 
536  sine = _mm512_mask_blend_ps(condition1, sine, cosine);
537  sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
538  _mm512_storeu_ps(sinPtr, sine);
539  inPtr += 16;
540  sinPtr += 16;
541  }
542 
543  number = sixteenPoints * 16;
544  for (; number < num_points; number++) {
545  *sinPtr++ = sinf(*inPtr++);
546  }
547 }
548 #endif
549 
550 #if LV_HAVE_AVX2 && LV_HAVE_FMA
551 #include <immintrin.h>
552 
553 static inline void
554 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
555 {
556  float* bPtr = bVector;
557  const float* aPtr = aVector;
558 
559  unsigned int number = 0;
560  unsigned int eighthPoints = num_points / 8;
561  unsigned int i = 0;
562 
563  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
564  fzeroes;
565  __m256 sine, cosine, condition1, condition2;
566  __m256i q, r, ones, twos, fours;
567 
568  m4pi = _mm256_set1_ps(1.273239545);
569  pio4A = _mm256_set1_ps(0.78515625);
570  pio4B = _mm256_set1_ps(0.241876e-3);
571  ffours = _mm256_set1_ps(4.0);
572  ftwos = _mm256_set1_ps(2.0);
573  fones = _mm256_set1_ps(1.0);
574  fzeroes = _mm256_setzero_ps();
575  ones = _mm256_set1_epi32(1);
576  twos = _mm256_set1_epi32(2);
577  fours = _mm256_set1_epi32(4);
578 
579  cp1 = _mm256_set1_ps(1.0);
580  cp2 = _mm256_set1_ps(0.83333333e-1);
581  cp3 = _mm256_set1_ps(0.2777778e-2);
582  cp4 = _mm256_set1_ps(0.49603e-4);
583  cp5 = _mm256_set1_ps(0.551e-6);
584 
585  for (; number < eighthPoints; number++) {
586  aVal = _mm256_loadu_ps(aPtr);
587  s = _mm256_sub_ps(aVal,
588  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
589  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
590  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
591  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
592 
593  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
594  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
595 
596  s = _mm256_div_ps(
597  s,
598  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
599  s = _mm256_mul_ps(s, s);
600  // Evaluate Taylor series
601  s = _mm256_mul_ps(
602  _mm256_fmadd_ps(
603  _mm256_fmsub_ps(
604  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
605  s,
606  cp1),
607  s);
608 
609  for (i = 0; i < 3; i++) {
610  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
611  }
612  s = _mm256_div_ps(s, ftwos);
613 
614  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
615  cosine = _mm256_sub_ps(fones, s);
616 
617  condition1 = _mm256_cmp_ps(
618  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
619  fzeroes,
620  _CMP_NEQ_UQ);
621  condition2 = _mm256_cmp_ps(
622  _mm256_cmp_ps(
623  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
624  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
625  _CMP_NEQ_UQ);
626  // Need this condition only for cos
627  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
628  // twos), fours)), fzeroes);
629 
630  sine =
631  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
632  sine = _mm256_sub_ps(
633  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
634  _mm256_storeu_ps(bPtr, sine);
635  aPtr += 8;
636  bPtr += 8;
637  }
638 
639  number = eighthPoints * 8;
640  for (; number < num_points; number++) {
641  *bPtr++ = sin(*aPtr++);
642  }
643 }
644 
645 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
646 
647 #ifdef LV_HAVE_AVX2
648 #include <immintrin.h>
649 
650 static inline void
651 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
652 {
653  float* bPtr = bVector;
654  const float* aPtr = aVector;
655 
656  unsigned int number = 0;
657  unsigned int eighthPoints = num_points / 8;
658  unsigned int i = 0;
659 
660  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
661  fzeroes;
662  __m256 sine, cosine, condition1, condition2;
663  __m256i q, r, ones, twos, fours;
664 
665  m4pi = _mm256_set1_ps(1.273239545);
666  pio4A = _mm256_set1_ps(0.78515625);
667  pio4B = _mm256_set1_ps(0.241876e-3);
668  ffours = _mm256_set1_ps(4.0);
669  ftwos = _mm256_set1_ps(2.0);
670  fones = _mm256_set1_ps(1.0);
671  fzeroes = _mm256_setzero_ps();
672  ones = _mm256_set1_epi32(1);
673  twos = _mm256_set1_epi32(2);
674  fours = _mm256_set1_epi32(4);
675 
676  cp1 = _mm256_set1_ps(1.0);
677  cp2 = _mm256_set1_ps(0.83333333e-1);
678  cp3 = _mm256_set1_ps(0.2777778e-2);
679  cp4 = _mm256_set1_ps(0.49603e-4);
680  cp5 = _mm256_set1_ps(0.551e-6);
681 
682  for (; number < eighthPoints; number++) {
683  aVal = _mm256_loadu_ps(aPtr);
684  s = _mm256_sub_ps(aVal,
685  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
686  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
687  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
688  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
689 
690  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
691  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
692 
693  s = _mm256_div_ps(
694  s,
695  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
696  s = _mm256_mul_ps(s, s);
697  // Evaluate Taylor series
698  s = _mm256_mul_ps(
699  _mm256_add_ps(
700  _mm256_mul_ps(
701  _mm256_sub_ps(
702  _mm256_mul_ps(
703  _mm256_add_ps(
704  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
705  s),
706  cp3),
707  s),
708  cp2),
709  s),
710  cp1),
711  s);
712 
713  for (i = 0; i < 3; i++) {
714  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
715  }
716  s = _mm256_div_ps(s, ftwos);
717 
718  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
719  cosine = _mm256_sub_ps(fones, s);
720 
721  condition1 = _mm256_cmp_ps(
722  _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
723  fzeroes,
724  _CMP_NEQ_UQ);
725  condition2 = _mm256_cmp_ps(
726  _mm256_cmp_ps(
727  _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
728  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
729  _CMP_NEQ_UQ);
730  // Need this condition only for cos
731  // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
732  // twos), fours)), fzeroes);
733 
734  sine =
735  _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
736  sine = _mm256_sub_ps(
737  sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
738  _mm256_storeu_ps(bPtr, sine);
739  aPtr += 8;
740  bPtr += 8;
741  }
742 
743  number = eighthPoints * 8;
744  for (; number < num_points; number++) {
745  *bPtr++ = sin(*aPtr++);
746  }
747 }
748 
749 #endif /* LV_HAVE_AVX2 for unaligned */
750 
751 
752 #ifdef LV_HAVE_SSE4_1
753 #include <smmintrin.h>
754 
755 static inline void
756 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
757 {
758  float* bPtr = bVector;
759  const float* aPtr = aVector;
760 
761  unsigned int number = 0;
762  unsigned int quarterPoints = num_points / 4;
763  unsigned int i = 0;
764 
765  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
766  fzeroes;
767  __m128 sine, cosine, condition1, condition2;
768  __m128i q, r, ones, twos, fours;
769 
770  m4pi = _mm_set1_ps(1.273239545);
771  pio4A = _mm_set1_ps(0.78515625);
772  pio4B = _mm_set1_ps(0.241876e-3);
773  ffours = _mm_set1_ps(4.0);
774  ftwos = _mm_set1_ps(2.0);
775  fones = _mm_set1_ps(1.0);
776  fzeroes = _mm_setzero_ps();
777  ones = _mm_set1_epi32(1);
778  twos = _mm_set1_epi32(2);
779  fours = _mm_set1_epi32(4);
780 
781  cp1 = _mm_set1_ps(1.0);
782  cp2 = _mm_set1_ps(0.83333333e-1);
783  cp3 = _mm_set1_ps(0.2777778e-2);
784  cp4 = _mm_set1_ps(0.49603e-4);
785  cp5 = _mm_set1_ps(0.551e-6);
786 
787  for (; number < quarterPoints; number++) {
788  aVal = _mm_loadu_ps(aPtr);
789  s = _mm_sub_ps(aVal,
790  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
791  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
792  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
793 
794  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
795  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
796 
797  s = _mm_div_ps(
798  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
799  s = _mm_mul_ps(s, s);
800  // Evaluate Taylor series
801  s = _mm_mul_ps(
802  _mm_add_ps(
803  _mm_mul_ps(
804  _mm_sub_ps(
805  _mm_mul_ps(
806  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
807  cp3),
808  s),
809  cp2),
810  s),
811  cp1),
812  s);
813 
814  for (i = 0; i < 3; i++) {
815  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
816  }
817  s = _mm_div_ps(s, ftwos);
818 
819  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
820  cosine = _mm_sub_ps(fones, s);
821 
822  condition1 = _mm_cmpneq_ps(
823  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
824  condition2 = _mm_cmpneq_ps(
825  _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
826  _mm_cmplt_ps(aVal, fzeroes));
827 
828  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
829  sine =
830  _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
831  _mm_storeu_ps(bPtr, sine);
832  aPtr += 4;
833  bPtr += 4;
834  }
835 
836  number = quarterPoints * 4;
837  for (; number < num_points; number++) {
838  *bPtr++ = sinf(*aPtr++);
839  }
840 }
841 
842 #endif /* LV_HAVE_SSE4_1 for unaligned */
843 
844 
845 #ifdef LV_HAVE_GENERIC
846 
847 static inline void
848 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
849 {
850  float* bPtr = bVector;
851  const float* aPtr = aVector;
852  unsigned int number = 0;
853 
854  for (number = 0; number < num_points; number++) {
855  *bPtr++ = sinf(*aPtr++);
856  }
857 }
858 
859 #endif /* LV_HAVE_GENERIC */
860 
861 
862 #ifdef LV_HAVE_NEON
863 #include <arm_neon.h>
865 
866 static inline void
867 volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
868 {
869  unsigned int number = 0;
870  unsigned int quarter_points = num_points / 4;
871  float* bVectorPtr = bVector;
872  const float* aVectorPtr = aVector;
873 
874  float32x4_t b_vec;
875  float32x4_t a_vec;
876 
877  for (number = 0; number < quarter_points; number++) {
878  a_vec = vld1q_f32(aVectorPtr);
879  // Prefetch next one, speeds things up
880  __VOLK_PREFETCH(aVectorPtr + 4);
881  b_vec = _vsinq_f32(a_vec);
882  vst1q_f32(bVectorPtr, b_vec);
883  // move pointers ahead
884  bVectorPtr += 4;
885  aVectorPtr += 4;
886  }
887 
888  // Deal with the rest
889  for (number = quarter_points * 4; number < num_points; number++) {
890  *bVectorPtr++ = sinf(*aVectorPtr++);
891  }
892 }
893 
894 #endif /* LV_HAVE_NEON */
895 
896 
897 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:848
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:867
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:249