Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_sse3_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * This file is intended to hold helper functions built from SSE3 intrinsics.
12  * They should be used in VOLK kernels to avoid copy-and-paste duplication.
13  */
14 
15 #ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
16 #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
17 #include <pmmintrin.h>
18 
/*
 * Multiply the two complex floats packed in x by the two complex floats
 * packed in y, element-wise.
 *
 * Lane layout (interleaved real/imaginary):
 *   x = ar, ai, br, bi
 *   y = cr, ci, dr, di
 *
 * Returns:
 *   ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di
 */
static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
{
    const __m128 y_re = _mm_moveldup_ps(y); // cr, cr, dr, dr
    const __m128 y_im = _mm_movehdup_ps(y); // ci, ci, di, di

    // 0xB1 selects lanes (1, 0, 3, 2): swap real and imaginary within each pair.
    const __m128 x_swapped = _mm_shuffle_ps(x, x, 0xB1); // ai, ar, bi, br

    const __m128 re_terms = _mm_mul_ps(x, y_re);         // ar*cr, ai*cr, br*dr, bi*dr
    const __m128 im_terms = _mm_mul_ps(x_swapped, y_im); // ai*ci, ar*ci, bi*di, br*di

    // addsub subtracts in the even lanes and adds in the odd lanes.
    return _mm_addsub_ps(re_terms, im_terms);
}
30 
32 {
33  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
34  y = _mm_xor_ps(y, conjugator); // conjugate y
35  return _mm_complexmul_ps(x, y);
36 }
37 
/*
 * Compute the squared magnitude of four complex floats.
 *
 * cplxValue1 and cplxValue2 each hold two complex floats as interleaved
 * (re, im) pairs. The result holds re^2 + im^2 for the two values of
 * cplxValue1 in the low lanes, followed by the two values of cplxValue2.
 */
static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
    const __m128 squares_lo = _mm_mul_ps(cplxValue1, cplxValue1); // re^2, im^2, ...
    const __m128 squares_hi = _mm_mul_ps(cplxValue2, cplxValue2); // re^2, im^2, ...

    // The horizontal add sums each adjacent (re^2, im^2) pair.
    return _mm_hadd_ps(squares_lo, squares_hi);
}
44 
45 static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
46 {
47  return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
48 }
49 
50 static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
51  const __m128 symbols1,
52  const __m128 points0,
53  const __m128 points1,
54  const __m128 scalar)
55 {
56  /*
57  * Calculate: |y - x|^2 * SNR_lin
58  * Consider 'symbolsX' and 'pointsX' to be complex float
59  * 'symbolsX' are 'y' and 'pointsX' are 'x'
60  */
61  const __m128 diff0 = _mm_sub_ps(symbols0, points0);
62  const __m128 diff1 = _mm_sub_ps(symbols1, points1);
63  const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
64  return _mm_mul_ps(norms, scalar);
65 }
66 
67 #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2958
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
Definition: sse2neon.h:6611
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6496
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2523
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
Definition: sse2neon.h:6627
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
Definition: sse2neon.h:2659
static __m128 _mm_complexmul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:19
static __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:45
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:38
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse3_intrinsics.h:50
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31