Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
59 #include <inttypes.h>
60 #include <math.h>
61 #include <stdio.h>
62 
63 #ifndef INCLUDED_volk_32f_cos_32f_a_H
64 #define INCLUDED_volk_32f_cos_32f_a_H
65 
66 #ifdef LV_HAVE_AVX512F
67 
68 #include <immintrin.h>
69 static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
70  const float* inVector,
71  unsigned int num_points)
72 {
73  float* cosPtr = cosVector;
74  const float* inPtr = inVector;
75 
76  unsigned int number = 0;
77  unsigned int sixteenPoints = num_points / 16;
78  unsigned int i = 0;
79 
80  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
81  fones, sine, cosine;
82  __m512i q, zeros, ones, twos, fours;
83 
84  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
85  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
86  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
87  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
88  ffours = _mm512_set1_ps(4.0);
89  ftwos = _mm512_set1_ps(2.0);
90  fones = _mm512_set1_ps(1.0);
91  zeros = _mm512_setzero_epi32();
92  ones = _mm512_set1_epi32(1);
93  twos = _mm512_set1_epi32(2);
94  fours = _mm512_set1_epi32(4);
95 
96  cp1 = _mm512_set1_ps(1.0);
97  cp2 = _mm512_set1_ps(0.08333333333333333);
98  cp3 = _mm512_set1_ps(0.002777777777777778);
99  cp4 = _mm512_set1_ps(4.96031746031746e-05);
100  cp5 = _mm512_set1_ps(5.511463844797178e-07);
101  __mmask16 condition1, condition2;
102 
103  for (; number < sixteenPoints; number++) {
104  aVal = _mm512_load_ps(inPtr);
105  // s = fabs(aVal)
106  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
107 
108  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
109  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
110  // r = q + q&1, q indicates quadrant, r gives
111  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
112 
113  s = _mm512_fnmadd_ps(r, pio4A, s);
114  s = _mm512_fnmadd_ps(r, pio4B, s);
115  s = _mm512_fnmadd_ps(r, pio4C, s);
116 
117  s = _mm512_div_ps(
118  s,
119  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
120  s = _mm512_mul_ps(s, s);
121  // Evaluate Taylor series
122  s = _mm512_mul_ps(
123  _mm512_fmadd_ps(
124  _mm512_fmsub_ps(
125  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
126  s,
127  cp1),
128  s);
129 
130  for (i = 0; i < 3; i++)
131  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
132  s = _mm512_div_ps(s, ftwos);
133 
134  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
135  cosine = _mm512_sub_ps(fones, s);
136 
137  // if(((q+1)&2) != 0) { cosine=sine;}
138  condition1 = _mm512_cmpneq_epi32_mask(
139  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
140 
141  // if(((q+2)&4) != 0) { cosine = -cosine;}
142  condition2 = _mm512_cmpneq_epi32_mask(
143  _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
144  cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
145  cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
146  _mm512_store_ps(cosPtr, cosine);
147  inPtr += 16;
148  cosPtr += 16;
149  }
150 
151  number = sixteenPoints * 16;
152  for (; number < num_points; number++) {
153  *cosPtr++ = cosf(*inPtr++);
154  }
155 }
156 #endif
157 
158 #if LV_HAVE_AVX2 && LV_HAVE_FMA
159 #include <immintrin.h>
160 
161 static inline void
162 volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
163 {
164  float* bPtr = bVector;
165  const float* aPtr = aVector;
166 
167  unsigned int number = 0;
168  unsigned int eighthPoints = num_points / 8;
169  unsigned int i = 0;
170 
171  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
172  fones, fzeroes;
173  __m256 sine, cosine;
174  __m256i q, ones, twos, fours;
175 
176  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
177  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
178  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
179  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
180  ffours = _mm256_set1_ps(4.0);
181  ftwos = _mm256_set1_ps(2.0);
182  fones = _mm256_set1_ps(1.0);
183  fzeroes = _mm256_setzero_ps();
184  __m256i zeroes = _mm256_set1_epi32(0);
185  ones = _mm256_set1_epi32(1);
186  __m256i allones = _mm256_set1_epi32(0xffffffff);
187  twos = _mm256_set1_epi32(2);
188  fours = _mm256_set1_epi32(4);
189 
190  cp1 = _mm256_set1_ps(1.0);
191  cp2 = _mm256_set1_ps(0.08333333333333333);
192  cp3 = _mm256_set1_ps(0.002777777777777778);
193  cp4 = _mm256_set1_ps(4.96031746031746e-05);
194  cp5 = _mm256_set1_ps(5.511463844797178e-07);
195  union bit256 condition1;
196  union bit256 condition3;
197 
198  for (; number < eighthPoints; number++) {
199 
200  aVal = _mm256_load_ps(aPtr);
201  // s = fabs(aVal)
202  s = _mm256_sub_ps(aVal,
203  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
204  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
205  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
206  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
207  // r = q + q&1, q indicates quadrant, r gives
208  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
209 
210  s = _mm256_fnmadd_ps(r, pio4A, s);
211  s = _mm256_fnmadd_ps(r, pio4B, s);
212  s = _mm256_fnmadd_ps(r, pio4C, s);
213 
214  s = _mm256_div_ps(
215  s,
216  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
217  s = _mm256_mul_ps(s, s);
218  // Evaluate Taylor series
219  s = _mm256_mul_ps(
220  _mm256_fmadd_ps(
221  _mm256_fmsub_ps(
222  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
223  s,
224  cp1),
225  s);
226 
227  for (i = 0; i < 3; i++)
228  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
229  s = _mm256_div_ps(s, ftwos);
230 
231  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
232  cosine = _mm256_sub_ps(fones, s);
233 
234  // if(((q+1)&2) != 0) { cosine=sine;}
235  condition1.int_vec =
236  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
237  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
238 
239  // if(((q+2)&4) != 0) { cosine = -cosine;}
240  condition3.int_vec = _mm256_cmpeq_epi32(
241  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
242  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
243 
244  cosine = _mm256_add_ps(
245  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
246  cosine = _mm256_sub_ps(cosine,
247  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
248  condition3.float_vec));
249  _mm256_store_ps(bPtr, cosine);
250  aPtr += 8;
251  bPtr += 8;
252  }
253 
254  number = eighthPoints * 8;
255  for (; number < num_points; number++) {
256  *bPtr++ = cos(*aPtr++);
257  }
258 }
259 
260 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
261 
262 #ifdef LV_HAVE_AVX2
263 #include <immintrin.h>
264 
265 static inline void
266 volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
267 {
268  float* bPtr = bVector;
269  const float* aPtr = aVector;
270 
271  unsigned int number = 0;
272  unsigned int eighthPoints = num_points / 8;
273  unsigned int i = 0;
274 
275  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
276  fones, fzeroes;
277  __m256 sine, cosine;
278  __m256i q, ones, twos, fours;
279 
280  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
281  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
282  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
283  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
284  ffours = _mm256_set1_ps(4.0);
285  ftwos = _mm256_set1_ps(2.0);
286  fones = _mm256_set1_ps(1.0);
287  fzeroes = _mm256_setzero_ps();
288  __m256i zeroes = _mm256_set1_epi32(0);
289  ones = _mm256_set1_epi32(1);
290  __m256i allones = _mm256_set1_epi32(0xffffffff);
291  twos = _mm256_set1_epi32(2);
292  fours = _mm256_set1_epi32(4);
293 
294  cp1 = _mm256_set1_ps(1.0);
295  cp2 = _mm256_set1_ps(0.08333333333333333);
296  cp3 = _mm256_set1_ps(0.002777777777777778);
297  cp4 = _mm256_set1_ps(4.96031746031746e-05);
298  cp5 = _mm256_set1_ps(5.511463844797178e-07);
299  union bit256 condition1;
300  union bit256 condition3;
301 
302  for (; number < eighthPoints; number++) {
303 
304  aVal = _mm256_load_ps(aPtr);
305  // s = fabs(aVal)
306  s = _mm256_sub_ps(aVal,
307  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
308  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
309  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
310  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
311  // r = q + q&1, q indicates quadrant, r gives
312  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
313 
314  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
315  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
316  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
317 
318  s = _mm256_div_ps(
319  s,
320  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
321  s = _mm256_mul_ps(s, s);
322  // Evaluate Taylor series
323  s = _mm256_mul_ps(
324  _mm256_add_ps(
325  _mm256_mul_ps(
326  _mm256_sub_ps(
327  _mm256_mul_ps(
328  _mm256_add_ps(
329  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
330  s),
331  cp3),
332  s),
333  cp2),
334  s),
335  cp1),
336  s);
337 
338  for (i = 0; i < 3; i++)
339  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
340  s = _mm256_div_ps(s, ftwos);
341 
342  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
343  cosine = _mm256_sub_ps(fones, s);
344 
345  // if(((q+1)&2) != 0) { cosine=sine;}
346  condition1.int_vec =
347  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
348  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
349 
350  // if(((q+2)&4) != 0) { cosine = -cosine;}
351  condition3.int_vec = _mm256_cmpeq_epi32(
352  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
353  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
354 
355  cosine = _mm256_add_ps(
356  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
357  cosine = _mm256_sub_ps(cosine,
358  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
359  condition3.float_vec));
360  _mm256_store_ps(bPtr, cosine);
361  aPtr += 8;
362  bPtr += 8;
363  }
364 
365  number = eighthPoints * 8;
366  for (; number < num_points; number++) {
367  *bPtr++ = cos(*aPtr++);
368  }
369 }
370 
371 #endif /* LV_HAVE_AVX2 for aligned */
372 
373 #ifdef LV_HAVE_SSE4_1
374 #include <smmintrin.h>
375 
376 static inline void
377 volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
378 {
379  float* bPtr = bVector;
380  const float* aPtr = aVector;
381 
382  unsigned int number = 0;
383  unsigned int quarterPoints = num_points / 4;
384  unsigned int i = 0;
385 
386  __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
387  fones, fzeroes;
388  __m128 sine, cosine;
389  __m128i q, ones, twos, fours;
390 
391  m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
392  pio4A = _mm_set1_ps(0.7853981554508209228515625);
393  pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
394  pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
395  ffours = _mm_set1_ps(4.0);
396  ftwos = _mm_set1_ps(2.0);
397  fones = _mm_set1_ps(1.0);
398  fzeroes = _mm_setzero_ps();
399  __m128i zeroes = _mm_set1_epi32(0);
400  ones = _mm_set1_epi32(1);
401  __m128i allones = _mm_set1_epi32(0xffffffff);
402  twos = _mm_set1_epi32(2);
403  fours = _mm_set1_epi32(4);
404 
405  cp1 = _mm_set1_ps(1.0);
406  cp2 = _mm_set1_ps(0.08333333333333333);
407  cp3 = _mm_set1_ps(0.002777777777777778);
408  cp4 = _mm_set1_ps(4.96031746031746e-05);
409  cp5 = _mm_set1_ps(5.511463844797178e-07);
410  union bit128 condition1;
411  union bit128 condition3;
412 
413  for (; number < quarterPoints; number++) {
414 
415  aVal = _mm_load_ps(aPtr);
416  // s = fabs(aVal)
417  s = _mm_sub_ps(aVal,
418  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
419  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
420  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
421  // r = q + q&1, q indicates quadrant, r gives
422  r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
423 
424  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
425  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
426  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
427 
428  s = _mm_div_ps(
429  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
430  s = _mm_mul_ps(s, s);
431  // Evaluate Taylor series
432  s = _mm_mul_ps(
433  _mm_add_ps(
434  _mm_mul_ps(
435  _mm_sub_ps(
436  _mm_mul_ps(
437  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
438  cp3),
439  s),
440  cp2),
441  s),
442  cp1),
443  s);
444 
445  for (i = 0; i < 3; i++)
446  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
447  s = _mm_div_ps(s, ftwos);
448 
449  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
450  cosine = _mm_sub_ps(fones, s);
451 
452  // if(((q+1)&2) != 0) { cosine=sine;}
453  condition1.int_vec =
454  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
455  condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
456 
457  // if(((q+2)&4) != 0) { cosine = -cosine;}
458  condition3.int_vec =
459  _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
460  condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
461 
462  cosine = _mm_add_ps(cosine,
463  _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
464  cosine = _mm_sub_ps(
465  cosine,
466  _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
467  _mm_store_ps(bPtr, cosine);
468  aPtr += 4;
469  bPtr += 4;
470  }
471 
472  number = quarterPoints * 4;
473  for (; number < num_points; number++) {
474  *bPtr++ = cosf(*aPtr++);
475  }
476 }
477 
478 #endif /* LV_HAVE_SSE4_1 for aligned */
479 
480 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
481 
482 
483 #ifndef INCLUDED_volk_32f_cos_32f_u_H
484 #define INCLUDED_volk_32f_cos_32f_u_H
485 
486 #ifdef LV_HAVE_AVX512F
487 
488 #include <immintrin.h>
489 static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
490  const float* inVector,
491  unsigned int num_points)
492 {
493  float* cosPtr = cosVector;
494  const float* inPtr = inVector;
495 
496  unsigned int number = 0;
497  unsigned int sixteenPoints = num_points / 16;
498  unsigned int i = 0;
499 
500  __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
501  fones, sine, cosine;
502  __m512i q, zeros, ones, twos, fours;
503 
504  m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
505  pio4A = _mm512_set1_ps(0.7853981554508209228515625);
506  pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
507  pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
508  ffours = _mm512_set1_ps(4.0);
509  ftwos = _mm512_set1_ps(2.0);
510  fones = _mm512_set1_ps(1.0);
511  zeros = _mm512_setzero_epi32();
512  ones = _mm512_set1_epi32(1);
513  twos = _mm512_set1_epi32(2);
514  fours = _mm512_set1_epi32(4);
515 
516  cp1 = _mm512_set1_ps(1.0);
517  cp2 = _mm512_set1_ps(0.08333333333333333);
518  cp3 = _mm512_set1_ps(0.002777777777777778);
519  cp4 = _mm512_set1_ps(4.96031746031746e-05);
520  cp5 = _mm512_set1_ps(5.511463844797178e-07);
521  __mmask16 condition1, condition2;
522  for (; number < sixteenPoints; number++) {
523  aVal = _mm512_loadu_ps(inPtr);
524  // s = fabs(aVal)
525  s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
526 
527  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
528  q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
529  // r = q + q&1, q indicates quadrant, r gives
530  r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
531 
532  s = _mm512_fnmadd_ps(r, pio4A, s);
533  s = _mm512_fnmadd_ps(r, pio4B, s);
534  s = _mm512_fnmadd_ps(r, pio4C, s);
535 
536  s = _mm512_div_ps(
537  s,
538  _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
539  s = _mm512_mul_ps(s, s);
540  // Evaluate Taylor series
541  s = _mm512_mul_ps(
542  _mm512_fmadd_ps(
543  _mm512_fmsub_ps(
544  _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
545  s,
546  cp1),
547  s);
548 
549  for (i = 0; i < 3; i++)
550  s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
551  s = _mm512_div_ps(s, ftwos);
552 
553  sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
554  cosine = _mm512_sub_ps(fones, s);
555 
556  // if(((q+1)&2) != 0) { cosine=sine;}
557  condition1 = _mm512_cmpneq_epi32_mask(
558  _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
559 
560  // if(((q+2)&4) != 0) { cosine = -cosine;}
561  condition2 = _mm512_cmpneq_epi32_mask(
562  _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
563 
564  cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
565  cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
566  _mm512_storeu_ps(cosPtr, cosine);
567  inPtr += 16;
568  cosPtr += 16;
569  }
570 
571  number = sixteenPoints * 16;
572  for (; number < num_points; number++) {
573  *cosPtr++ = cosf(*inPtr++);
574  }
575 }
576 #endif
577 
578 #if LV_HAVE_AVX2 && LV_HAVE_FMA
579 #include <immintrin.h>
580 
581 static inline void
582 volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
583 {
584  float* bPtr = bVector;
585  const float* aPtr = aVector;
586 
587  unsigned int number = 0;
588  unsigned int eighthPoints = num_points / 8;
589  unsigned int i = 0;
590 
591  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
592  fones, fzeroes;
593  __m256 sine, cosine;
594  __m256i q, ones, twos, fours;
595 
596  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
597  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
598  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
599  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
600  ffours = _mm256_set1_ps(4.0);
601  ftwos = _mm256_set1_ps(2.0);
602  fones = _mm256_set1_ps(1.0);
603  fzeroes = _mm256_setzero_ps();
604  __m256i zeroes = _mm256_set1_epi32(0);
605  ones = _mm256_set1_epi32(1);
606  __m256i allones = _mm256_set1_epi32(0xffffffff);
607  twos = _mm256_set1_epi32(2);
608  fours = _mm256_set1_epi32(4);
609 
610  cp1 = _mm256_set1_ps(1.0);
611  cp2 = _mm256_set1_ps(0.08333333333333333);
612  cp3 = _mm256_set1_ps(0.002777777777777778);
613  cp4 = _mm256_set1_ps(4.96031746031746e-05);
614  cp5 = _mm256_set1_ps(5.511463844797178e-07);
615  union bit256 condition1;
616  union bit256 condition3;
617 
618  for (; number < eighthPoints; number++) {
619 
620  aVal = _mm256_loadu_ps(aPtr);
621  // s = fabs(aVal)
622  s = _mm256_sub_ps(aVal,
623  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
624  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
625  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
626  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
627  // r = q + q&1, q indicates quadrant, r gives
628  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
629 
630  s = _mm256_fnmadd_ps(r, pio4A, s);
631  s = _mm256_fnmadd_ps(r, pio4B, s);
632  s = _mm256_fnmadd_ps(r, pio4C, s);
633 
634  s = _mm256_div_ps(
635  s,
636  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
637  s = _mm256_mul_ps(s, s);
638  // Evaluate Taylor series
639  s = _mm256_mul_ps(
640  _mm256_fmadd_ps(
641  _mm256_fmsub_ps(
642  _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
643  s,
644  cp1),
645  s);
646 
647  for (i = 0; i < 3; i++)
648  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
649  s = _mm256_div_ps(s, ftwos);
650 
651  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
652  cosine = _mm256_sub_ps(fones, s);
653 
654  // if(((q+1)&2) != 0) { cosine=sine;}
655  condition1.int_vec =
656  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
657  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
658 
659  // if(((q+2)&4) != 0) { cosine = -cosine;}
660  condition3.int_vec = _mm256_cmpeq_epi32(
661  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
662  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
663 
664  cosine = _mm256_add_ps(
665  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
666  cosine = _mm256_sub_ps(cosine,
667  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
668  condition3.float_vec));
669  _mm256_storeu_ps(bPtr, cosine);
670  aPtr += 8;
671  bPtr += 8;
672  }
673 
674  number = eighthPoints * 8;
675  for (; number < num_points; number++) {
676  *bPtr++ = cos(*aPtr++);
677  }
678 }
679 
680 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
681 
682 #ifdef LV_HAVE_AVX2
683 #include <immintrin.h>
684 
685 static inline void
686 volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
687 {
688  float* bPtr = bVector;
689  const float* aPtr = aVector;
690 
691  unsigned int number = 0;
692  unsigned int eighthPoints = num_points / 8;
693  unsigned int i = 0;
694 
695  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
696  fones, fzeroes;
697  __m256 sine, cosine;
698  __m256i q, ones, twos, fours;
699 
700  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
701  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
702  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
703  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
704  ffours = _mm256_set1_ps(4.0);
705  ftwos = _mm256_set1_ps(2.0);
706  fones = _mm256_set1_ps(1.0);
707  fzeroes = _mm256_setzero_ps();
708  __m256i zeroes = _mm256_set1_epi32(0);
709  ones = _mm256_set1_epi32(1);
710  __m256i allones = _mm256_set1_epi32(0xffffffff);
711  twos = _mm256_set1_epi32(2);
712  fours = _mm256_set1_epi32(4);
713 
714  cp1 = _mm256_set1_ps(1.0);
715  cp2 = _mm256_set1_ps(0.08333333333333333);
716  cp3 = _mm256_set1_ps(0.002777777777777778);
717  cp4 = _mm256_set1_ps(4.96031746031746e-05);
718  cp5 = _mm256_set1_ps(5.511463844797178e-07);
719  union bit256 condition1;
720  union bit256 condition3;
721 
722  for (; number < eighthPoints; number++) {
723 
724  aVal = _mm256_loadu_ps(aPtr);
725  // s = fabs(aVal)
726  s = _mm256_sub_ps(aVal,
727  _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
728  _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
729  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
730  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
731  // r = q + q&1, q indicates quadrant, r gives
732  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
733 
734  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
735  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
736  s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
737 
738  s = _mm256_div_ps(
739  s,
740  _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
741  s = _mm256_mul_ps(s, s);
742  // Evaluate Taylor series
743  s = _mm256_mul_ps(
744  _mm256_add_ps(
745  _mm256_mul_ps(
746  _mm256_sub_ps(
747  _mm256_mul_ps(
748  _mm256_add_ps(
749  _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
750  s),
751  cp3),
752  s),
753  cp2),
754  s),
755  cp1),
756  s);
757 
758  for (i = 0; i < 3; i++)
759  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
760  s = _mm256_div_ps(s, ftwos);
761 
762  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
763  cosine = _mm256_sub_ps(fones, s);
764 
765  // if(((q+1)&2) != 0) { cosine=sine;}
766  condition1.int_vec =
767  _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
768  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
769 
770  // if(((q+2)&4) != 0) { cosine = -cosine;}
771  condition3.int_vec = _mm256_cmpeq_epi32(
772  _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
773  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
774 
775  cosine = _mm256_add_ps(
776  cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
777  cosine = _mm256_sub_ps(cosine,
778  _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
779  condition3.float_vec));
780  _mm256_storeu_ps(bPtr, cosine);
781  aPtr += 8;
782  bPtr += 8;
783  }
784 
785  number = eighthPoints * 8;
786  for (; number < num_points; number++) {
787  *bPtr++ = cos(*aPtr++);
788  }
789 }
790 
791 #endif /* LV_HAVE_AVX2 for unaligned */
792 
793 #ifdef LV_HAVE_SSE4_1
794 #include <smmintrin.h>
795 
796 static inline void
797 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
798 {
799  float* bPtr = bVector;
800  const float* aPtr = aVector;
801 
802  unsigned int number = 0;
803  unsigned int quarterPoints = num_points / 4;
804  unsigned int i = 0;
805 
806  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
807  fzeroes;
808  __m128 sine, cosine, condition1, condition3;
809  __m128i q, r, ones, twos, fours;
810 
811  m4pi = _mm_set1_ps(1.273239545);
812  pio4A = _mm_set1_ps(0.78515625);
813  pio4B = _mm_set1_ps(0.241876e-3);
814  ffours = _mm_set1_ps(4.0);
815  ftwos = _mm_set1_ps(2.0);
816  fones = _mm_set1_ps(1.0);
817  fzeroes = _mm_setzero_ps();
818  ones = _mm_set1_epi32(1);
819  twos = _mm_set1_epi32(2);
820  fours = _mm_set1_epi32(4);
821 
822  cp1 = _mm_set1_ps(1.0);
823  cp2 = _mm_set1_ps(0.83333333e-1);
824  cp3 = _mm_set1_ps(0.2777778e-2);
825  cp4 = _mm_set1_ps(0.49603e-4);
826  cp5 = _mm_set1_ps(0.551e-6);
827 
828  for (; number < quarterPoints; number++) {
829  aVal = _mm_loadu_ps(aPtr);
830  s = _mm_sub_ps(aVal,
831  _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
832  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
833  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
834 
835  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
836  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
837 
838  s = _mm_div_ps(
839  s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
840  s = _mm_mul_ps(s, s);
841  // Evaluate Taylor series
842  s = _mm_mul_ps(
843  _mm_add_ps(
844  _mm_mul_ps(
845  _mm_sub_ps(
846  _mm_mul_ps(
847  _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
848  cp3),
849  s),
850  cp2),
851  s),
852  cp1),
853  s);
854 
855  for (i = 0; i < 3; i++) {
856  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
857  }
858  s = _mm_div_ps(s, ftwos);
859 
860  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
861  cosine = _mm_sub_ps(fones, s);
862 
863  condition1 = _mm_cmpneq_ps(
864  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
865 
866  condition3 = _mm_cmpneq_ps(
867  _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
868 
869  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
870  cosine = _mm_sub_ps(
871  cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
872  _mm_storeu_ps(bPtr, cosine);
873  aPtr += 4;
874  bPtr += 4;
875  }
876 
877  number = quarterPoints * 4;
878  for (; number < num_points; number++) {
879  *bPtr++ = cosf(*aPtr++);
880  }
881 }
882 
883 #endif /* LV_HAVE_SSE4_1 for unaligned */
884 
885 
886 #ifdef LV_HAVE_GENERIC
887 
888 /*
889  * For derivation see
890  * Shibata, Naoki, "Efficient evaluation methods of elementary functions
891  * suitable for SIMD computation," in Springer-Verlag 2010
892  */
893 static inline void volk_32f_cos_32f_generic_fast(float* bVector,
894  const float* aVector,
895  unsigned int num_points)
896 {
897  float* bPtr = bVector;
898  const float* aPtr = aVector;
899 
900  float m4pi = 1.273239544735162542821171882678754627704620361328125;
901  float pio4A = 0.7853981554508209228515625;
902  float pio4B = 0.794662735614792836713604629039764404296875e-8;
903  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
904  int N = 3; // order of argument reduction
905 
906  unsigned int number;
907  for (number = 0; number < num_points; number++) {
908  float s = fabs(*aPtr);
909  int q = (int)(s * m4pi);
910  int r = q + (q & 1);
911  s -= r * pio4A;
912  s -= r * pio4B;
913  s -= r * pio4C;
914 
915  s = s * 0.125; // 2^-N (<--3)
916  s = s * s;
917  s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
918  1.0) *
919  s;
920 
921  int i;
922  for (i = 0; i < N; ++i) {
923  s = (4.0 - s) * s;
924  }
925  s = s / 2.0;
926 
927  float sine = sqrt((2.0 - s) * s);
928  float cosine = 1 - s;
929 
930  if (((q + 1) & 2) != 0) {
931  s = cosine;
932  cosine = sine;
933  sine = s;
934  }
935  if (((q + 2) & 4) != 0) {
936  cosine = -cosine;
937  }
938  *bPtr = cosine;
939  bPtr++;
940  aPtr++;
941  }
942 }
943 
944 #endif /* LV_HAVE_GENERIC */
945 
946 
947 #ifdef LV_HAVE_GENERIC
948 
949 static inline void
950 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
951 {
952  float* bPtr = bVector;
953  const float* aPtr = aVector;
954  unsigned int number = 0;
955 
956  for (; number < num_points; number++) {
957  *bPtr++ = cosf(*aPtr++);
958  }
959 }
960 
961 #endif /* LV_HAVE_GENERIC */
962 
963 
964 #ifdef LV_HAVE_NEON
965 #include <arm_neon.h>
967 
968 static inline void
969 volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
970 {
971  unsigned int number = 0;
972  unsigned int quarter_points = num_points / 4;
973  float* bVectorPtr = bVector;
974  const float* aVectorPtr = aVector;
975 
976  float32x4_t b_vec;
977  float32x4_t a_vec;
978 
979  for (number = 0; number < quarter_points; number++) {
980  a_vec = vld1q_f32(aVectorPtr);
981  // Prefetch next one, speeds things up
982  __VOLK_PREFETCH(aVectorPtr + 4);
983  b_vec = _vcosq_f32(a_vec);
984  vst1q_f32(bVectorPtr, b_vec);
985  // move pointers ahead
986  bVectorPtr += 4;
987  aVectorPtr += 4;
988  }
989 
990  // Deal with the rest
991  for (number = quarter_points * 4; number < num_points; number++) {
992  *bVectorPtr++ = cosf(*aVectorPtr++);
993  }
994 }
995 
996 #endif /* LV_HAVE_NEON */
997 
998 
999 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i)
Definition: sse2neon.h:3275
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
Definition: sse2neon.h:6458
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
Definition: volk_common.h:120
Definition: volk_common.h:137
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:950
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:969
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:893
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:255