Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * Copyright (c) 2016-2019 ARM Limited.
12  *
13  * SPDX-License-Identifier: MIT
14  *
15  * Permission is hereby granted, free of charge, to any person obtaining a copy
16  * of this software and associated documentation files (the "Software"), to
17  * deal in the Software without restriction, including without limitation the
18  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
19  * sell copies of the Software, and to permit persons to whom the Software is
20  * furnished to do so, subject to the following conditions:
21  *
22  * The above copyright notice and this permission notice shall be included in all
23  * copies or substantial portions of the Software.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  *
33  * _vtaylor_polyq_f32
34  * _vlogq_f32
35  *
36  */
37 
38 /* Copyright (C) 2011 Julien Pommier
39 
40  This software is provided 'as-is', without any express or implied
41  warranty. In no event will the authors be held liable for any damages
42  arising from the use of this software.
43 
44  Permission is granted to anyone to use this software for any purpose,
45  including commercial applications, and to alter it and redistribute it
46  freely, subject to the following restrictions:
47 
48  1. The origin of this software must not be misrepresented; you must not
49  claim that you wrote the original software. If you use this software
50  in a product, an acknowledgment in the product documentation would be
51  appreciated but is not required.
52  2. Altered source versions must be plainly marked as such, and must not be
53  misrepresented as being the original software.
54  3. This notice may not be removed or altered from any source distribution.
55 
56  (this is the zlib license)
57 
58  _vsincosq_f32
59 
60 */
61 
/*
 * This file holds NEON intrinsics that are composed of other intrinsics.
 * They should be used in VOLK kernels to avoid copy-pasting the same code.
 */
66 
67 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
68 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
69 #include <arm_neon.h>
70 
71 
72 /* Magnitude squared for float32x4x2_t */
/* Squared magnitude |z|^2 = re^2 + im^2 for four complex floats.
 * val[0] holds the real parts, val[1] the imaginary parts. */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t re_squared = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]);
    const float32x4_t im_squared = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]);
    return vaddq_f32(re_squared, im_squared);
}
81 
82 /* Inverse square root for float32x4_t */
/* Approximate 1/sqrt(x) for four floats: hardware reciprocal-sqrt estimate
 * refined by two Newton-Raphson steps (vrsqrtsq_f32 supplies the
 * (3 - x*e*e)/2 correction factor). */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t estimate = vrsqrteq_f32(x);
    for (int step = 0; step < 2; step++) {
        const float32x4_t correction =
            vrsqrtsq_f32(vmulq_f32(x, estimate), estimate);
        estimate = vmulq_f32(correction, estimate);
    }
    return estimate;
}
93 
94 /* Inverse */
/* Approximate 1/x for four floats: hardware reciprocal estimate refined by
 * two Newton-Raphson steps (vrecpsq_f32 supplies the 2 - x*e correction). */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    float32x4_t estimate = vrecpeq_f32(x);
    for (int step = 0; step < 2; step++) {
        estimate = vmulq_f32(vrecpsq_f32(x, estimate), estimate);
    }
    return estimate;
}
103 
104 /* Complex multiplication for float32x4x2_t */
/* Element-wise complex multiply of four complex pairs:
 * (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br).
 * val[0] holds real parts, val[1] imaginary parts.
 * Kept as separate mul/sub/add (not vmlaq/vmlsq) so rounding matches the
 * original non-fused sequence. */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
                                                    float32x4x2_t b_val)
{
    const float32x4_t ar_br = vmulq_f32(a_val.val[0], b_val.val[0]);
    const float32x4_t ai_bi = vmulq_f32(a_val.val[1], b_val.val[1]);
    const float32x4_t ar_bi = vmulq_f32(a_val.val[0], b_val.val[1]);
    const float32x4_t ai_br = vmulq_f32(a_val.val[1], b_val.val[0]);

    float32x4x2_t c_val;
    c_val.val[0] = vsubq_f32(ar_br, ai_bi); /* real part */
    c_val.val[1] = vaddq_f32(ar_bi, ai_br); /* imaginary part */
    return c_val;
}
127 
128 /* From ARM Compute Library, MIT license */
/* From ARM Compute Library, MIT license.
 * Estrin-like evaluation of a degree-7 polynomial at x. The coefficient
 * layout is non-obvious; with c = coeffs, the result is
 *   (c[0] + c[4]*x)
 * + (c[2] + c[6]*x) * x^2
 * + ((c[1] + c[5]*x) + (c[3] + c[7]*x) * x^2) * x^4
 * i.e. c0 + c4*x + c2*x^2 + c6*x^3 + c1*x^4 + c5*x^5 + c3*x^6 + c7*x^7.
 * Coefficient tables (e.g. in _vlogq_f32) must follow this ordering. */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    const float32x4_t pair04 = vmlaq_f32(coeffs[0], coeffs[4], x);
    const float32x4_t pair26 = vmlaq_f32(coeffs[2], coeffs[6], x);
    const float32x4_t pair15 = vmlaq_f32(coeffs[1], coeffs[5], x);
    const float32x4_t pair37 = vmlaq_f32(coeffs[3], coeffs[7], x);
    const float32x4_t x2 = vmulq_f32(x, x);
    const float32x4_t x4 = vmulq_f32(x2, x2);
    const float32x4_t low = vmlaq_f32(pair04, pair26, x2);
    const float32x4_t high = vmlaq_f32(pair15, pair37, x2);
    return vmlaq_f32(low, high, x4);
}
140 
141 /* Natural logarithm.
142  * From ARM Compute Library, MIT license */
/* Natural logarithm for four floats.
 * From ARM Compute Library, MIT license.
 *
 * Decomposes x = 2^m * val (val has the mantissa of x with a zeroed-out
 * exponent), evaluates a degree-7 polynomial approximation of log(val),
 * then reconstructs log(x) = poly + m * ln(2).
 *
 * NOTE(review): no special-case handling is visible here for x <= 0,
 * infinities, NaN or denormals -- presumably callers pass positive normal
 * floats; confirm against the kernels that use this helper. */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    /* Polynomial coefficients, laid out in the pairing order that
     * _vtaylor_polyq_f32 expects (not plain ascending powers). */
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t CONST_127 = vdupq_n_s32(127); // IEEE-754 single exponent bias
    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)

    // Extract exponent: shift the biased exponent field down 23 bits,
    // then remove the bias to get the true exponent m.
    int32x4_t m = vsubq_s32(
        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
    // Remove the exponent from x in integer space (subtract m << 23),
    // leaving val = mantissa of x with a zero (biased-127) exponent.
    float32x4_t val =
        vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    // Polynomial approximation of log(val)
    float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);

    // Reconstruct: log(x) = log(val) + m * ln(2)
    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);

    return poly;
}
169 
170 /* Evaluation of 4 sines & cosines at once.
171  * Optimized from here (zlib license)
172  * http://gruntthepeon.free.fr/ssemath/ */
/* Evaluation of 4 sines & cosines at once.
 * Optimized from here (zlib license)
 * http://gruntthepeon.free.fr/ssemath/
 *
 * Cephes-style algorithm: reduce x to [0, pi/4] by subtracting multiples of
 * pi/4 (using a three-part split of pi/4 for extended precision), evaluate
 * both a sine and a cosine minimax polynomial on the reduced argument, then
 * select and sign-correct per lane according to the octant.
 * Returns {sin(x), cos(x)} in val[0]/val[1]. */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
{
    /* -pi/4 split into three parts so that y*DP1 + y*DP2 + y*DP3 subtracts
     * y*pi/4 with more precision than a single float multiply. */
    const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
    const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
    const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
    /* Minimax polynomial coefficients for sin and cos on the reduced range. */
    const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
    const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
    const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
    const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
    const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
    const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI

    const float32x4_t CONST_1 = vdupq_n_f32(1.f);
    const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
    const float32x4_t CONST_0 = vdupq_n_f32(0.f);
    const uint32x4_t CONST_2 = vdupq_n_u32(2);
    const uint32x4_t CONST_4 = vdupq_n_u32(4);

    uint32x4_t emm2; // integer octant index per lane

    uint32x4_t sign_mask_sin, sign_mask_cos;
    // sin(-x) = -sin(x): remember the sign, then work with |x|.
    sign_mask_sin = vcltq_f32(x, CONST_0);
    x = vabsq_f32(x);
    // scale by 4/pi
    float32x4_t y = vmulq_f32(x, c_cephes_FOPI);

    // store the integer part of y in mm0
    emm2 = vcvtq_u32_f32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* get the polynom selection mask
    there is one polynom for 0 <= x <= Pi/4
    and another one for Pi/4<x<=Pi/2
    Both branches will be computed. */
    const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = x - y * pi/4, performed in three steps to limit rounding error.
    x = vmlaq_f32(x, y, c_minus_cephes_DP1);
    x = vmlaq_f32(x, y, c_minus_cephes_DP2);
    x = vmlaq_f32(x, y, c_minus_cephes_DP3);

    // Octant bit 2 flips the sign of sin; (octant - 2) bit 2 flips cos.
    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);

    /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
    and the second polynom (Pi/4 <= x <= 0) in y2 */
    float32x4_t y1, y2;
    float32x4_t z = vmulq_f32(x, x);

    // y1: cosine polynomial, Horner form in z = x^2:
    // ((p0*z + p1)*z + p2)*z^2 - z/2 + 1
    y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
    y1 = vmlaq_f32(c_coscof_p2, z, y1);
    y1 = vmulq_f32(y1, z);
    y1 = vmulq_f32(y1, z);
    y1 = vmlsq_f32(y1, z, CONST_1_2);
    y1 = vaddq_f32(y1, CONST_1);

    // y2: sine polynomial, Horner form in z, times x plus x:
    // x + x * (((p0*z + p1)*z + p2)*z)
    y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
    y2 = vmlaq_f32(c_sincof_p2, z, y2);
    y2 = vmulq_f32(y2, z);
    y2 = vmlaq_f32(x, x, y2);

    /* select the correct result from the two polynoms */
    const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);

    // Apply the per-lane sign corrections computed above.
    float32x4x2_t sincos;
    sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));

    return sincos;
}
248 
/* Sine of four floats; thin wrapper that discards the cosine half of
 * _vsincosq_f32. */
static inline float32x4_t _vsinq_f32(float32x4_t x)
{
    return _vsincosq_f32(x).val[0];
}
254 
/* Cosine of four floats; thin wrapper that discards the sine half of
 * _vsincosq_f32. */
static inline float32x4_t _vcosq_f32(float32x4_t x)
{
    return _vsincosq_f32(x).val[1];
}
260 
/* Tangent of four floats, computed as sin(x) * (1/cos(x)) using the
 * Newton-refined reciprocal approximation from _vinvq_f32. */
static inline float32x4_t _vtanq_f32(float32x4_t x)
{
    const float32x4x2_t sc = _vsincosq_f32(x);
    const float32x4_t inv_cos = _vinvq_f32(sc.val[1]);
    return vmulq_f32(sc.val[0], inv_cos);
}
266 
/* One vectorized accumulation step for a running sum of squared deviations:
 * returns sq_acc + (aux * val - acc)^2 * rec.
 * NOTE(review): the statistical meaning of acc/val/rec/aux (this looks like
 * a Youngs-Cramer style variance update) depends on the calling kernel,
 * which is not visible here -- confirm against the caller. */
static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
                                                          float32x4_t acc,
                                                          float32x4_t val,
                                                          float32x4_t rec,
                                                          float32x4_t aux)
{
    aux = vmulq_f32(aux, val);
    aux = vsubq_f32(aux, acc);
    aux = vmulq_f32(aux, aux); // squared deviation
#ifdef LV_HAVE_NEONV8
    /* ARMv8: fused multiply-add does the scale-and-accumulate in one
     * instruction (single rounding). */
    return vfmaq_f32(sq_acc, aux, rec);
#else
    /* ARMv7 fallback: separate multiply and add. */
    aux = vmulq_f32(aux, rec);
    return vaddq_f32(sq_acc, aux);
#endif
}
283 
284 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:57
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:83
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:95
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:105
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:249
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:143
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:255
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:73
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:267
static float32x4x2_t _vsincosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:173
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:261
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:129