66 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
67 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
76 const unsigned int num_points)
79 for (
unsigned int i = 0;
i < num_points; ++
i) {
84 diff = symbol - *points++;
92 #include <immintrin.h>
96 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(
float* target,
100 unsigned int num_points)
102 const unsigned int num_bytes = num_points * 8;
105 __m256 xmm_points0, xmm_points1, xmm_result;
107 const unsigned int bound = num_bytes >> 6;
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
120 for (
unsigned int i = 0;
i < bound; ++
i) {
121 xmm_points0 = _mm256_load_ps((
float*)points);
122 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
129 _mm256_store_ps(target, xmm_result);
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((
float*)points);
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
152 if (num_bytes >> 4 & 1) {
176 #include <immintrin.h>
184 unsigned int num_points)
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
189 __m256 xmm_points0, xmm_points1, xmm_result;
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
197 for (
int i = 0;
i < eightsPoints; ++
i) {
198 xmm_points0 = _mm256_load_ps((
float*)points);
199 xmm_points1 = _mm256_load_ps((
float*)(points + 4));
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
205 _mm256_store_ps(target, xmm_result);
217 #include <pmmintrin.h>
225 unsigned int num_points)
227 __m128 xmm_points0, xmm_points1, xmm_result;
235 const int quarterPoints = num_points / 4;
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
245 for (
int i = 0;
i < quarterPoints; ++
i) {
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
258 for (
int i = 0;
i < leftovers0; ++
i) {
262 xmm_points0 =
_mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 =
_mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 =
_mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result =
_mm_mul_ps(xmm_points0, xmm_scalar);
278 #include <xmmintrin.h>
284 unsigned int num_points)
289 for (
unsigned i = 0;
i < num_points / 4; ++
i) {
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
303 #ifdef LV_HAVE_GENERIC
309 unsigned int num_points)
320 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
321 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
327 #include <immintrin.h>
331 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(
float* target,
335 unsigned int num_points)
337 const unsigned int num_bytes = num_points * 8;
340 __m256 xmm_points0, xmm_points1, xmm_result;
342 const unsigned int bound = num_bytes >> 6;
345 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
346 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
349 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
350 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
353 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
355 for (
unsigned int i = 0;
i < bound; ++
i) {
356 xmm_points0 = _mm256_loadu_ps((
float*)points);
357 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
362 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
364 _mm256_storeu_ps(target, xmm_result);
368 if (num_bytes >> 5 & 1) {
369 xmm_points0 = _mm256_loadu_ps((
float*)points);
371 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
375 xmm6 = _mm256_mul_ps(xmm4, xmm4);
377 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
378 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
380 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
382 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
387 if (num_bytes >> 4 & 1) {
411 #include <immintrin.h>
419 unsigned int num_points)
421 const int eightsPoints = num_points / 8;
422 const int remainder = num_points - 8 * eightsPoints;
424 __m256 xmm_points0, xmm_points1, xmm_result;
427 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((
const double*)src0));
430 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
432 for (
int i = 0;
i < eightsPoints; ++
i) {
433 xmm_points0 = _mm256_loadu_ps((
float*)points);
434 xmm_points1 = _mm256_loadu_ps((
float*)(points + 4));
438 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
440 _mm256_storeu_ps(target, xmm_result);
452 #include <pmmintrin.h>
460 unsigned int num_points)
462 __m128 xmm_points0, xmm_points1, xmm_result;
470 const int quarterPoints = num_points / 4;
471 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
472 const int leftovers1 = num_points % 2;
480 for (
int i = 0;
i < quarterPoints; ++
i) {
487 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
493 for (
int i = 0;
i < leftovers0; ++
i) {
497 xmm_points0 =
_mm_sub_ps(xmm_symbol, xmm_points0);
498 xmm_points0 =
_mm_mul_ps(xmm_points0, xmm_points0);
499 xmm_points0 =
_mm_hadd_ps(xmm_points0, xmm_points0);
500 xmm_result =
_mm_mul_ps(xmm_points0, xmm_scalar);
513 #include <xmmintrin.h>
519 unsigned int num_points)
524 for (
unsigned i = 0;
i < num_points / 4; ++
i) {
529 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
Definition: sse2neon.h:4483
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
int64x1_t __m64
Definition: sse2neon.h:234
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
Definition: sse2neon.h:3206
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
Definition: sse2neon.h:1885
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition: sse2neon.h:2751
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:180
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:415
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:515
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:221
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:72
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:280
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:305
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:456
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:92
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:75
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse3_intrinsics.h:50
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:36