Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_tan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64 
65 #ifndef INCLUDED_volk_32f_tan_32f_a_H
66 #define INCLUDED_volk_32f_tan_32f_a_H
67 
68 #if LV_HAVE_AVX2 && LV_HAVE_FMA
69 #include <immintrin.h>
70 
/*!
 * \brief Computes the tangent of each input point (aligned AVX2+FMA version).
 *
 * Processes 8 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (32-byte aligned); receives tan(aVector[i])
 * \param aVector    input buffer (32-byte aligned); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm256_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* masked swap of sine and cosine where condition1 is set */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf, not tan: single precision matches the vector path and the
         * other kernels in this file; tan() forced a double round-trip */
        *bPtr++ = tanf(*aPtr++);
    }
}
169 
170 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
171 
172 #ifdef LV_HAVE_AVX2
173 #include <immintrin.h>
174 
/*!
 * \brief Computes the tangent of each input point (aligned AVX2 version, no FMA).
 *
 * Processes 8 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (32-byte aligned); receives tan(aVector[i])
 * \param aVector    input buffer (32-byte aligned); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm256_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* masked swap of sine and cosine where condition1 is set */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_store_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf, not tan: single precision matches the vector path and the
         * other kernels in this file; tan() forced a double round-trip */
        *bPtr++ = tanf(*aPtr++);
    }
}
280 
281 #endif /* LV_HAVE_AVX2 for aligned */
282 
283 #ifdef LV_HAVE_SSE4_1
284 #include <smmintrin.h>
285 
/*!
 * \brief Computes the tangent of each input point (aligned SSE4.1 version).
 *
 * Processes 4 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (16-byte aligned); receives tan(aVector[i])
 * \param aVector    input buffer (16-byte aligned); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        /* masked swap of sine and cosine where condition1 is set */
        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_store_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the remaining (num_points % 4) entries */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
379 
380 #endif /* LV_HAVE_SSE4_1 for aligned */
381 
382 
383 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
384 
385 #ifndef INCLUDED_volk_32f_tan_32f_u_H
386 #define INCLUDED_volk_32f_tan_32f_u_H
387 
388 #if LV_HAVE_AVX2 && LV_HAVE_FMA
389 #include <immintrin.h>
390 
/*!
 * \brief Computes the tangent of each input point (unaligned AVX2+FMA version).
 *
 * Processes 8 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (no alignment required); receives tan(aVector[i])
 * \param aVector    input buffer (no alignment required); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm256_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
        s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_fmadd_ps(
                _mm256_fmsub_ps(
                    _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
                s,
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* masked swap of sine and cosine where condition1 is set */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf, not tan: single precision matches the vector path and the
         * other kernels in this file; tan() forced a double round-trip */
        *bPtr++ = tanf(*aPtr++);
    }
}
489 
490 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
491 
492 #ifdef LV_HAVE_AVX2
493 #include <immintrin.h>
494 
/*!
 * \brief Computes the tangent of each input point (unaligned AVX2 version, no FMA).
 *
 * Processes 8 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (no alignment required); receives tan(aVector[i])
 * \param aVector    input buffer (no alignment required); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    unsigned int i = 0;

    __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m256 sine, cosine, tangent, condition1, condition2, condition3;
    __m256i q, r, ones, twos, fours;

    m4pi = _mm256_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm256_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm256_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm256_set1_ps(4.0);
    ftwos = _mm256_set1_ps(2.0);
    fones = _mm256_set1_ps(1.0);
    fzeroes = _mm256_setzero_ps();
    ones = _mm256_set1_epi32(1);
    twos = _mm256_set1_epi32(2);
    fours = _mm256_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm256_set1_ps(1.0);
    cp2 = _mm256_set1_ps(0.83333333e-1);
    cp3 = _mm256_set1_ps(0.2777778e-2);
    cp4 = _mm256_set1_ps(0.49603e-4);
    cp5 = _mm256_set1_ps(0.551e-6);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm256_sub_ps(aVal,
                          _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
                                        _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
        r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
        s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

        s = _mm256_div_ps(
            s,
            _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm256_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm256_mul_ps(
            _mm256_add_ps(
                _mm256_mul_ps(
                    _mm256_sub_ps(
                        _mm256_mul_ps(
                            _mm256_add_ps(
                                _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
                                              s),
                                cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
        }
        s = _mm256_div_ps(s, ftwos);

        sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
        cosine = _mm256_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
            fzeroes,
            _CMP_NEQ_UQ);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm256_cmp_ps(
            _mm256_cmp_ps(
                _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
            _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
            _CMP_NEQ_UQ);
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
            fzeroes,
            _CMP_NEQ_UQ);

        /* masked swap of sine and cosine where condition1 is set */
        __m256 temp = cosine;
        cosine =
            _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
        sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine = _mm256_sub_ps(
            sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
        cosine = _mm256_sub_ps(
            cosine,
            _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
        tangent = _mm256_div_ps(sine, cosine);
        _mm256_storeu_ps(bPtr, tangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* tanf, not tan: single precision matches the vector path and the
         * other kernels in this file; tan() forced a double round-trip */
        *bPtr++ = tanf(*aPtr++);
    }
}
600 
601 #endif /* LV_HAVE_AVX2 for unaligned */
602 
603 
604 #ifdef LV_HAVE_SSE4_1
605 #include <smmintrin.h>
606 
/*!
 * \brief Computes the tangent of each input point (unaligned SSE4.1 version).
 *
 * Processes 4 floats per iteration: the angle is folded to its octant via
 * floor(x * 4/pi), reduced by r * pi/4 using a two-constant subtraction for
 * extra precision, sin/cos are rebuilt from a polynomial on the reduced
 * argument, and tan = sin/cos. Leftover points use the scalar libm path.
 *
 * \param bVector    output buffer (no alignment required); receives tan(aVector[i])
 * \param aVector    input buffer (no alignment required); angles in radians
 * \param num_points number of float entries to process
 */
static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    unsigned int i = 0;

    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
        fzeroes;
    __m128 sine, cosine, tangent, condition1, condition2, condition3;
    __m128i q, r, ones, twos, fours;

    m4pi = _mm_set1_ps(1.273239545);  /* 4/pi: maps the angle to its octant */
    pio4A = _mm_set1_ps(0.78515625);  /* pi/4 split across two constants... */
    pio4B = _mm_set1_ps(0.241876e-3); /* ...for extended-precision reduction */
    ffours = _mm_set1_ps(4.0);
    ftwos = _mm_set1_ps(2.0);
    fones = _mm_set1_ps(1.0);
    fzeroes = _mm_setzero_ps();
    ones = _mm_set1_epi32(1);
    twos = _mm_set1_epi32(2);
    fours = _mm_set1_epi32(4);

    /* polynomial coefficients for the series evaluated inside the loop */
    cp1 = _mm_set1_ps(1.0);
    cp2 = _mm_set1_ps(0.83333333e-1);
    cp3 = _mm_set1_ps(0.2777778e-2);
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* s = |aVal| (x - 2x on negative lanes); sign is restored at the end */
        s = _mm_sub_ps(aVal,
                       _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
        /* octant index q = floor(s * 4/pi); r = q rounded up to even */
        q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
        r = _mm_add_epi32(q, _mm_and_si128(q, ones));

        /* s -= r * pi/4, subtracted in two parts to limit rounding error */
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

        s = _mm_div_ps(
            s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
        s = _mm_mul_ps(s, s);
        // Evaluate Taylor series
        s = _mm_mul_ps(
            _mm_add_ps(
                _mm_mul_ps(
                    _mm_sub_ps(
                        _mm_mul_ps(
                            _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                       cp3),
                            s),
                        cp2),
                    s),
                cp1),
            s);

        /* undo the /8 above: three applications of s <- s * (4 - s) */
        for (i = 0; i < 3; i++) {
            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
        }
        s = _mm_div_ps(s, ftwos);

        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
        cosine = _mm_sub_ps(fones, s);

        /* condition1: lanes whose sine/cosine must be swapped (octant parity) */
        condition1 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
        /* condition2: lanes whose sine must be negated (octant XOR input sign) */
        condition2 = _mm_cmpneq_ps(
            _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
            _mm_cmplt_ps(aVal, fzeroes));
        /* condition3: lanes whose cosine must be negated */
        condition3 = _mm_cmpneq_ps(
            _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

        /* masked swap of sine and cosine where condition1 is set */
        __m128 temp = cosine;
        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
        /* masked negation (x - 2x) of sine / cosine lanes */
        sine =
            _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
        cosine = _mm_sub_ps(
            cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
        tangent = _mm_div_ps(sine, cosine);
        _mm_storeu_ps(bPtr, tangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the remaining (num_points % 4) entries */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = tanf(*aPtr++);
    }
}
700 
701 #endif /* LV_HAVE_SSE4_1 for unaligned */
702 
703 
704 #ifdef LV_HAVE_GENERIC
705 
706 static inline void
707 volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
708 {
709  float* bPtr = bVector;
710  const float* aPtr = aVector;
711  unsigned int number = 0;
712 
713  for (; number < num_points; number++) {
714  *bPtr++ = tanf(*aPtr++);
715  }
716 }
717 #endif /* LV_HAVE_GENERIC */
718 
719 
720 #ifdef LV_HAVE_NEON
721 #include <arm_neon.h>
723 
724 static inline void
725 volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
726 {
727  unsigned int number = 0;
728  unsigned int quarter_points = num_points / 4;
729  float* bVectorPtr = bVector;
730  const float* aVectorPtr = aVector;
731 
732  float32x4_t b_vec;
733  float32x4_t a_vec;
734 
735  for (number = 0; number < quarter_points; number++) {
736  a_vec = vld1q_f32(aVectorPtr);
737  // Prefetch next one, speeds things up
738  __VOLK_PREFETCH(aVectorPtr + 4);
739  b_vec = _vtanq_f32(a_vec);
740  vst1q_f32(bVectorPtr, b_vec);
741  // move pointers ahead
742  bVectorPtr += 4;
743  aVectorPtr += 4;
744  }
745 
746  // Deal with the rest
747  for (number = quarter_points * 4; number < num_points; number++) {
748  *bVectorPtr++ = tanf(*aVectorPtr++);
749  }
750 }
751 #endif /* LV_HAVE_NEON */
752 
753 
754 #endif /* INCLUDED_volk_32f_tan_32f_u_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:2984
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1756
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128)
Definition: sse2neon.h:4036
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i)
Definition: sse2neon.h:3128
FORCE_INLINE __m128i _mm_set1_epi32(int)
Definition: sse2neon.h:5212
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_floor_ps(__m128)
Definition: sse2neon.h:7781
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1205
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
int64x2_t __m128i
Definition: sse2neon.h:244
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
Definition: sse2neon.h:3937
static void volk_32f_tan_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tan_32f.h:725
static void volk_32f_tan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tan_32f.h:707
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
for i
Definition: volk_config_fixed.tmpl.h:13
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:261