#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_

#include <arm_neon.h>
/* Magnitude squared of four complex floats: |z|^2 = I^2 + Q^2 per lane. */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    float32x4_t iValue, qValue, result;
    iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); /* I * I */
    qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); /* Q * Q */
    result = vaddq_f32(iValue, qValue);                       /* I^2 + Q^2 */
    return result;
}
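/* Illustrative usage sketch, not part of the VOLK API: how a kernel might apply
 * _vmagnitudesquaredq_f32 to an interleaved complex buffer. The function name,
 * parameters, and the assumption that num_points is a multiple of 4 (no scalar
 * tail handling) are hypothetical. */
static inline void
example_magnitude_squared(float* out, const float* complex_in, unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n += 4) {
        /* vld2q_f32 de-interleaves: val[0] holds four I samples, val[1] four Q samples. */
        const float32x4x2_t z = vld2q_f32(complex_in + 2 * n);
        vst1q_f32(out + n, _vmagnitudesquaredq_f32(z));
    }
}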
/* Reciprocal square root: vrsqrteq_f32 estimate refined by two Newton-Raphson steps. */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
    sqrt_reciprocal = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
    sqrt_reciprocal = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);

    return sqrt_reciprocal;
}
/* Reciprocal: vrecpeq_f32 estimate refined by two Newton-Raphson steps. */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    float32x4_t recip = vrecpeq_f32(x);
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
    return recip;
}
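/* Illustrative usage sketch, not part of the VOLK API: combining the helpers above,
 * |z| = |z|^2 * rsqrt(|z|^2) and a / b = a * (1/b). Names are hypothetical; real
 * kernels would also guard against zero inputs, where the reciprocal estimates
 * return infinity. */
static inline float32x4_t example_magnitude(float32x4x2_t z)
{
    const float32x4_t mag_sq = _vmagnitudesquaredq_f32(z);
    return vmulq_f32(mag_sq, _vinvsqrtq_f32(mag_sq)); /* x * x^(-1/2) = sqrt(x) */
}

static inline float32x4_t example_divide(float32x4_t num, float32x4_t den)
{
    return vmulq_f32(num, _vinvq_f32(den)); /* approximate num / den */
}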
/* Element-wise complex multiply of four complex floats (split I/Q layout). */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
    float32x4x2_t tmp_real;
    float32x4x2_t tmp_imag;
    float32x4x2_t c_val;

    /* Real part: ar*br - ai*bi */
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
    /* Imaginary part: ar*bi + ai*br */
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    return c_val;
}
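/* Illustrative usage sketch, not part of the VOLK API: element-wise complex multiply
 * of two interleaved buffers, four complex samples per iteration. The name and the
 * assumption that num_points is a multiple of 4 are hypothetical. */
static inline void example_multiply_complex(float* out,
                                            const float* a,
                                            const float* b,
                                            unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n += 4) {
        const float32x4x2_t a_val = vld2q_f32(a + 2 * n);
        const float32x4x2_t b_val = vld2q_f32(b + 2 * n);
        const float32x4x2_t c_val = _vmultiply_complexq_f32(a_val, b_val);
        vst2q_f32(out + 2 * n, c_val); /* re-interleave I/Q and store */
    }
}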
/* Degree-7 polynomial evaluation in Estrin-style form, used by _vlogq_f32 below. */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
    float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
    float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
    float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
    float32x4_t x2 = vmulq_f32(x, x);
    float32x4_t x4 = vmulq_f32(x2, x2);
    float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
    return res;
}
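/* Illustrative scalar reference, not part of the VOLK API: what the Estrin-style
 * evaluation above computes. The hypothetical function below makes the interleaved
 * coefficient order explicit. */
static inline float example_taylor_poly_scalar(float x, const float c[8])
{
    /* c[0] + c[4]*x + c[2]*x^2 + c[6]*x^3 + c[1]*x^4 + c[5]*x^5 + c[3]*x^6 + c[7]*x^7 */
    const float x2 = x * x;
    const float x4 = x2 * x2;
    const float cA = c[0] + c[4] * x;
    const float cB = c[2] + c[6] * x;
    const float cC = c[1] + c[5] * x;
    const float cD = c[3] + c[7] * x;
    return (cA + cB * x2) + (cC + cD * x2) * x4;
}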
/* Natural logarithm approximation; valid for x > 0. */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),  vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),  vdupq_n_f32(0.0141278216615f),
    };
    const int32x4_t CONST_127 = vdupq_n_s32(127);              /* exponent bias */
    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f);  /* ln(2) */

    /* Extract the exponent and reduce the mantissa to [1, 2). */
    int32x4_t m = vsubq_s32(
        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
    float32x4_t val =
        vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    /* log(x) = poly(mantissa) + m * ln(2) */
    float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
    return poly;
}
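/* Illustrative usage sketch, not part of the VOLK API: power to decibels via
 * 10*log10(p) = (10/ln 10) * ln(p). The name and constant are hypothetical; only
 * valid for strictly positive inputs. */
static inline float32x4_t example_power_to_db(float32x4_t power)
{
    const float32x4_t ten_over_ln10 = vdupq_n_f32(4.34294481903f); /* 10 / ln(10) */
    return vmulq_f32(_vlogq_f32(power), ten_over_ln10);
}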
/* Simultaneous sine and cosine (Cephes-style range reduction and polynomials). */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
{
    const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
    const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
    const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
    const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
    const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
    const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
    const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
    const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
    const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); /* 4/pi */

    const float32x4_t CONST_1 = vdupq_n_f32(1.f);
    const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
    const float32x4_t CONST_0 = vdupq_n_f32(0.f);
    const uint32x4_t CONST_2 = vdupq_n_u32(2);
    const uint32x4_t CONST_4 = vdupq_n_u32(4);

    uint32x4_t emm2;

    uint32x4_t sign_mask_sin, sign_mask_cos;
    sign_mask_sin = vcltq_f32(x, CONST_0);
    x = vabsq_f32(x);

    /* Scale by 4/pi. */
    float32x4_t y = vmulq_f32(x, c_cephes_FOPI);

    /* Store the integer part of y in emm2. */
    emm2 = vcvtq_u32_f32(y);
    /* j = (j + 1) & (~1) (see the cephes sources). */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* Polynomial selection mask: one polynomial for 0 <= x <= pi/4,
       another for pi/4 < x <= pi/2. */
    const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);

    /* Extended-precision modular arithmetic: x = ((x - y*DP1) - y*DP2) - y*DP3. */
    x = vmlaq_f32(x, y, c_minus_cephes_DP1);
    x = vmlaq_f32(x, y, c_minus_cephes_DP2);
    x = vmlaq_f32(x, y, c_minus_cephes_DP3);

    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);

    /* Evaluate the cosine polynomial in y1 and the sine polynomial in y2. */
    float32x4_t y1, y2;
    float32x4_t z = vmulq_f32(x, x);

    y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
    y1 = vmlaq_f32(c_coscof_p2, z, y1);
    y1 = vmulq_f32(y1, z);
    y1 = vmulq_f32(y1, z);
    y1 = vmlsq_f32(y1, z, CONST_1_2);
    y1 = vaddq_f32(y1, CONST_1);

    y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
    y2 = vmlaq_f32(c_sincof_p2, z, y2);
    y2 = vmulq_f32(y2, z);
    y2 = vmlaq_f32(x, x, y2);

    /* Select the correct result from the two polynomials and apply the signs. */
    const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);

    float32x4x2_t sincos;
    sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));

    return sincos;
}
/* Sine of four floats. */
static inline float32x4_t _vsinq_f32(float32x4_t x)
{
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return sincos.val[0];
}
/* Cosine of four floats. */
static inline float32x4_t _vcosq_f32(float32x4_t x)
{
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return sincos.val[1];
}
/* Tangent of four floats: sin(x) / cos(x) via the reciprocal helper. */
static inline float32x4_t _vtanq_f32(float32x4_t x)
{
    const float32x4x2_t sincos = _vsincosq_f32(x);
    return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
}
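/* Illustrative usage sketch, not part of the VOLK API: rotate four complex samples by
 * per-lane phases by building the phasor exp(j*phase) = cos(phase) + j*sin(phase) from
 * _vsincosq_f32 and multiplying with _vmultiply_complexq_f32. The name is hypothetical. */
static inline float32x4x2_t example_rotate(float32x4x2_t z, float32x4_t phase)
{
    const float32x4x2_t sc = _vsincosq_f32(phase);
    float32x4x2_t phasor;
    phasor.val[0] = sc.val[1]; /* real part: cos(phase) */
    phasor.val[1] = sc.val[0]; /* imaginary part: sin(phase) */
    return _vmultiply_complexq_f32(z, phasor);
}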
/* One step of a running sum-of-squared-deviations update:
   sq_acc + (aux*val - acc)^2 * rec, using a fused multiply-add on ARMv8. */
static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
                                                          float32x4_t acc,
                                                          float32x4_t val,
                                                          float32x4_t rec,
                                                          float32x4_t aux)
{
    aux = vmulq_f32(aux, val);
    aux = vsubq_f32(aux, acc);
    aux = vmulq_f32(aux, aux);
#ifdef LV_HAVE_NEONV8
    return vfmaq_f32(sq_acc, aux, rec);
#else
    aux = vmulq_f32(aux, rec);
    return vaddq_f32(sq_acc, aux);
#endif
}
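/* Illustrative scalar reference, not part of the VOLK API: the update above computes
 * sq_acc + (aux*val - acc)^2 * rec per lane, the shape of a Youngs-Cramer style
 * running-variance step. The operand interpretation (aux = sample count, acc = running
 * sum, rec = 1/(n*(n-1))) is an assumption; consult the calling kernel for the exact
 * meanings. */
static inline float example_accumulate_square_sum_scalar(
    float sq_acc, float acc, float val, float rec, float aux)
{
    const float d = aux * val - acc;
    return sq_acc + d * d * rec;
}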
#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */