/* -*- c++ -*- */
/*
 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>

static inline void calculate_scaled_distances(float* target,
                                              const lv_32fc_t symbol,
                                              const lv_32fc_t* points,
                                              const float scalar,
                                              const unsigned int num_points)
{
    lv_32fc_t diff;
    for (unsigned int i = 0; i < num_points; ++i) {
        /*
         * Calculate: |y - x|^2 * SNR_lin
         * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
         */
        diff = symbol - *points++;
        *target++ =
            scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    }
}
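
/*
 * Usage sketch (illustrative only; the buffer names, constellation values,
 * and sizes below are assumptions chosen for the example, not part of this
 * header). It computes scalar * |symbol - points[i]|^2 for a 4-point QPSK
 * constellation through the public dispatcher:
 *
 *   unsigned int N = 4;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* constellation =
 *       (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   float* distances = (float*)volk_malloc(N * sizeof(float), alignment);
 *   for (unsigned int i = 0; i < N; ++i) { // QPSK points on the unit circle
 *       constellation[i] = lv_cmake((i & 1) ? -M_SQRT1_2 : M_SQRT1_2,
 *                                   (i & 2) ? -M_SQRT1_2 : M_SQRT1_2);
 *   }
 *   lv_32fc_t symbol = lv_cmake(0.5f, -0.5f);
 *   float snr_lin = 2.0f;
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f(
 *       distances, &symbol, constellation, snr_lin, N);
 *   volk_free(constellation);
 *   volk_free(distances);
 */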


#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

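/*
 * _mm256_scaled_norm_dist_ps_avx2 (from volk_avx2_intrinsics.h), like the
 * AVX/SSE helpers used further below, computes scalar * |symbol - point|^2
 * for the complex points packed into its two point registers and returns
 * the packed float distances in order.
 */
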
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;
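
    /*
     * Each lv_32fc_t occupies 8 bytes, so num_bytes = 8 * num_points.
     * bound = num_bytes / 64 is the number of full 8-point chunks handled
     * by the main loop; the low bits of num_bytes then select the 4-, 2-
     * and 1-point tail passes below.
     */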

    // Load the complex symbol into all four complex slots of the register;
    // the re/im float pair is reinterpreted as one 64-bit value so it can
    // be broadcast with a single instruction.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set the cross-lane permutation constant used by the 4-point tail
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

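    /*
     * 4-point tail: bit 5 of num_bytes is set when num_points % 8 >= 4.
     * hadd sums the squared differences pairwise; the cross-lane permute
     * then packs the four distances into the upper 128-bit lane, which is
     * extracted and stored.
     */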
    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }

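    /*
     * 2-point tail: bit 4 of num_bytes is set when num_points % 4 >= 2.
     * After hadd the two distances sit duplicated in the register, so
     * _mm_storeh_pi writes just the upper pair to the target.
     */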
    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process four points per loop iteration; up to three may remain
     * afterwards. leftovers0 is 1 if at least two of them can still be
     * handled with SSE, and leftovers1 is 1 if a single point is left
     * for the scalar tail.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }

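    /*
     * If a pair of points remains (leftovers0), apply the same subtract,
     * square, hadd, scale pattern; _mm_storeh_pi writes the two packed
     * distances from the upper half of the register.
     */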
    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}
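
/*
 * Worked example: with symbol = 1 + 1i, a single point 0 + 0i and
 * scalar = 2, the squared distance is 1^2 + 1^2 = 2, so the kernel
 * writes 2 * 2 = 4 to target[0].
 */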

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <volk/volk_complex.h>

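/*
 * The unaligned (_u_) kernels below mirror the aligned (_a_) kernels
 * above; only the loads and stores change (_mm*_loadu_ps /
 * _mm*_storeu_ps), the arithmetic is identical.
 */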

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Set permutation constant
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * Process four points per loop iteration; up to three may remain
     * afterwards. leftovers0 is 1 if at least two of them can still be
     * handled with SSE, and leftovers1 is 1 if a single point is left
     * for the scalar tail.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/