Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_sse_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * This file is intended to hold helper functions composed of SSE intrinsics.
12  * They should be used in VOLK kernels to avoid copy-pasta.
13  */
14 
15 #ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17 #include <xmmintrin.h>
18 
/*
 * Squared magnitude of four complex values supplied as two registers of
 * interleaved I/Q floats (cplxValue1 = i1 q1 i2 q2, cplxValue2 = i3 q3 i4 q4).
 * Returns i^2 + q^2 for each of the four complex values, in input order.
 */
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    // Gather the even lanes of both inputs: i1 i2 i3 i4
    const __m128 in_phase =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    // Gather the odd lanes of both inputs: q1 q2 q3 q4
    const __m128 quadrature =
        _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));

    const __m128 in_phase_sq = _mm_mul_ps(in_phase, in_phase);
    const __m128 quadrature_sq = _mm_mul_ps(quadrature, quadrature);

    // I^2 + Q^2 per lane
    return _mm_add_ps(in_phase_sq, quadrature_sq);
}
30 
31 static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
32 {
33  return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
34 }
35 
36 static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
37  const __m128 symbols1,
38  const __m128 points0,
39  const __m128 points1,
40  const __m128 scalar)
41 {
42  // calculate scalar * |x - y|^2
43  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
44  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
45  const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
46  return _mm_mul_ps(norms, scalar);
47 }
48 
/*
 * Lane-wise update of a running sum of squares.
 *
 * Returns sq_acc + rec * (aux * val - acc)^2 for each of the four lanes.
 *
 * sq_acc: accumulator the new term is added to
 * acc:    value subtracted from the scaled sample
 * val:    sample value
 * rec:    weight applied to the squared difference
 * aux:    factor that val is multiplied by before the subtraction
 */
static inline __m128 _mm_accumulate_square_sum_ps(
    __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
{
    aux = _mm_mul_ps(aux, val); // aux * val
    aux = _mm_sub_ps(aux, acc); // aux * val - acc
    aux = _mm_mul_ps(aux, aux); // (aux * val - acc)^2
    aux = _mm_mul_ps(aux, rec); // rec * (aux * val - acc)^2
    return _mm_add_ps(sq_acc, aux);
}
58 
59 #endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:57
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:19
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:49
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:36
static __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse_intrinsics.h:31