Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
65 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
66 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
67 
68 #include <inttypes.h>
69 #include <stdio.h>
70 #include <volk/volk_complex.h>
71 
72 #ifdef LV_HAVE_AVX2
73 #include <immintrin.h>
74 
/*!
 * Computes target[k] = |src0[0] - points[k]|^2 for k in [0, num_points),
 * i.e. the squared Euclidean distance from one reference complex point to
 * each point in an aligned array. AVX2 path: 8 complex points per main-loop
 * iteration, then 4-, 2- and 1-point tails selected by the size bits.
 *
 * \param target     output array of num_points floats (16/32-byte aligned)
 * \param src0       pointer to the single reference point (aligned)
 * \param points     input array of num_points complex floats (aligned)
 * \param num_points number of points to process
 */
static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    /* Each lv_32fc_t is 8 bytes (two floats). */
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    /* bound: number of full 8-point (64-byte) chunks.  The remaining
     * size bits select the 4-point (bit 5), 2-point (bit 4) and
     * 1-point (bit 3) tail passes; each runs at most once. */
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;

    /* _mm256_hadd_ps works within 128-bit lanes, so its results come out
     * lane-interleaved; this index pattern restores in-memory order. */
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* NOTE(review): this is a 16-byte load from src0, though only src0[0]
     * (8 bytes) is used below — it reads 8 bytes past the reference point.
     * Presumably safe for an aligned load within one cache line; confirm
     * callers always provide aligned storage here. */
    xmm0 = _mm_load_ps((float*)src0);
    /* Duplicate src0[0] into both complex slots: [re, im, re, im]. */
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    /* Broadcast that pair into both 128-bit halves of the 256-bit register,
     * giving four copies of the reference point. */
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    /* Main loop: 8 complex points (two 256-bit loads) per iteration. */
    for (; i < bound; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);
        xmm3 = _mm256_load_ps((float*)&points[4]);
        points += 8;

        /* Component-wise difference, then square. */
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        /* Pairwise add re^2 + im^2 for all 8 points, then undo the
         * lane interleaving introduced by hadd. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    /* Tail: 4 remaining points (one 256-bit load). */
    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        /* After the permute the four valid sums sit in the upper
         * 128-bit half; extract and store them. */
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    /* Tail: 2 remaining points, processed with 128-bit ops against the
     * duplicated reference in xmm0. */
    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        /* The two sums are in the high 64 bits after hadd. */
        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    /* Tail: final odd point, scalar. */
    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}
161 
162 #endif /*LV_HAVE_AVX2*/
163 
164 #ifdef LV_HAVE_SSE3
165 #include <pmmintrin.h>
166 #include <xmmintrin.h>
167 
168 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
169  lv_32fc_t* src0,
170  lv_32fc_t* points,
171  unsigned int num_points)
172 {
173  const unsigned int num_bytes = num_points * 8;
174 
175  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
176 
177  lv_32fc_t diff;
178  float sq_dist;
179  int bound = num_bytes >> 5;
180  int i = 0;
181 
182  xmm1 = _mm_setzero_ps();
183  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
184  xmm2 = _mm_load_ps((float*)&points[0]);
185  xmm1 = _mm_movelh_ps(xmm1, xmm1);
186  xmm3 = _mm_load_ps((float*)&points[2]);
187 
188  for (; i < bound - 1; ++i) {
189  xmm4 = _mm_sub_ps(xmm1, xmm2);
190  xmm5 = _mm_sub_ps(xmm1, xmm3);
191  points += 4;
192  xmm6 = _mm_mul_ps(xmm4, xmm4);
193  xmm7 = _mm_mul_ps(xmm5, xmm5);
194 
195  xmm2 = _mm_load_ps((float*)&points[0]);
196 
197  xmm4 = _mm_hadd_ps(xmm6, xmm7);
198 
199  xmm3 = _mm_load_ps((float*)&points[2]);
200 
201  _mm_store_ps(target, xmm4);
202 
203  target += 4;
204  }
205 
206  xmm4 = _mm_sub_ps(xmm1, xmm2);
207  xmm5 = _mm_sub_ps(xmm1, xmm3);
208 
209  points += 4;
210  xmm6 = _mm_mul_ps(xmm4, xmm4);
211  xmm7 = _mm_mul_ps(xmm5, xmm5);
212 
213  xmm4 = _mm_hadd_ps(xmm6, xmm7);
214 
215  _mm_store_ps(target, xmm4);
216 
217  target += 4;
218 
219  if (num_bytes >> 4 & 1) {
220 
221  xmm2 = _mm_load_ps((float*)&points[0]);
222 
223  xmm4 = _mm_sub_ps(xmm1, xmm2);
224 
225  points += 2;
226 
227  xmm6 = _mm_mul_ps(xmm4, xmm4);
228 
229  xmm4 = _mm_hadd_ps(xmm6, xmm6);
230 
231  _mm_storeh_pi((__m64*)target, xmm4);
232 
233  target += 2;
234  }
235 
236  if (num_bytes >> 3 & 1) {
237 
238  diff = src0[0] - points[0];
239 
240  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
241 
242  target[0] = sq_dist;
243  }
244 }
245 
246 #endif /*LV_HAVE_SSE3*/
247 
248 
249 #ifdef LV_HAVE_NEON
250 #include <arm_neon.h>
/*!
 * Computes target[k] = |src0[0] - points[k]|^2 for k in [0, num_points):
 * the squared Euclidean distance from one reference complex point to each
 * point in the array. NEON path: vld2q deinterleaves four complex points
 * into separate real/imag vectors per iteration; the remainder is scalar.
 *
 * \param target     output array of num_points floats
 * \param src0       pointer to the single reference point
 * \param points     input array of num_points complex floats
 * \param num_points number of points to process
 */
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int n;

    float32x4x2_t ref, pts, delta;
    float32x4_t re_sq, im_sq, dist;

    /* Broadcast the reference point's components across full vectors. */
    ref.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    ref.val[1] = vdupq_n_f32(lv_cimag(src0[0]));

    /* Vector loop: four complex points at a time. */
    for (n = 0; n < quarter_points; ++n) {
        /* Deinterleaved load: val[0] = reals, val[1] = imags. */
        pts = vld2q_f32((float*)points);

        delta.val[0] = vsubq_f32(ref.val[0], pts.val[0]);
        delta.val[1] = vsubq_f32(ref.val[1], pts.val[1]);

        re_sq = vmulq_f32(delta.val[0], delta.val[0]);
        im_sq = vmulq_f32(delta.val[1], delta.val[1]);
        dist = vaddq_f32(re_sq, im_sq);

        vst1q_f32(target, dist);
        points += 4;
        target += 4;
    }

    /* Scalar cleanup for the remaining 0-3 points. */
    for (n = quarter_points * 4; n < num_points; ++n) {
        lv_32fc_t d = src0[0] - *points++;
        *target++ = lv_creal(d) * lv_creal(d) + lv_cimag(d) * lv_cimag(d);
    }
}
281 #endif /* LV_HAVE_NEON */
282 
283 
284 #ifdef LV_HAVE_GENERIC
285 static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
286  lv_32fc_t* src0,
287  lv_32fc_t* points,
288  unsigned int num_points)
289 {
290  const unsigned int num_bytes = num_points * 8;
291 
292  lv_32fc_t diff;
293  float sq_dist;
294  unsigned int i = 0;
295 
296  for (; i<num_bytes>> 3; ++i) {
297  diff = src0[0] - points[i];
298 
299  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
300 
301  target[i] = sq_dist;
302  }
303 }
304 
305 #endif /*LV_HAVE_GENERIC*/
306 
307 
308 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
309 
310 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
311 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
312 
313 #include <inttypes.h>
314 #include <stdio.h>
315 #include <volk/volk_complex.h>
316 
317 #ifdef LV_HAVE_AVX2
318 #include <immintrin.h>
319 
/*!
 * Computes target[k] = |src0[0] - points[k]|^2 for k in [0, num_points),
 * i.e. the squared Euclidean distance from one reference complex point to
 * each point in an (unaligned) array. AVX2 path: 8 complex points per
 * main-loop iteration, one optional 4-point SIMD tail, and a scalar loop
 * for the final 0-3 points (unlike the aligned variant, there is no
 * 2-point SIMD tail here).
 *
 * \param target     output array of num_points floats (no alignment required)
 * \param src0       pointer to the single reference point
 * \param points     input array of num_points complex floats
 * \param num_points number of points to process
 */
static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       lv_32fc_t* src0,
                                                       lv_32fc_t* points,
                                                       unsigned int num_points)
{
    /* Each lv_32fc_t is 8 bytes (two floats). */
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    /* bound: number of full 8-point (64-byte) chunks; leftovers1: the
     * 0-3 points handled by the scalar tail (low two bits of the point
     * count, via bits 3-4 of the byte count). */
    int bound = num_bytes >> 6;
    int leftovers1 = (num_bytes >> 3) & 0b11;
    int i = 0;

    /* _mm256_hadd_ps works within 128-bit lanes, so its results come out
     * lane-interleaved; this index pattern restores in-memory order. */
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    /* NOTE(review): this is a 16-byte load from src0, though only src0[0]
     * (8 bytes) is used below — it reads 8 bytes past the reference point.
     * Verify callers always make that read safe. */
    xmm0 = _mm_loadu_ps((float*)src0);
    /* Duplicate src0[0] into both complex slots: [re, im, re, im]. */
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    /* Broadcast that pair into both 128-bit halves of the 256-bit register,
     * giving four copies of the reference point. */
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    /* Main loop: 8 complex points (two 256-bit unaligned loads) per
     * iteration. */
    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);
        points += 8;

        /* Component-wise difference, then square. */
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        /* Pairwise add re^2 + im^2 for all 8 points, then undo the
         * lane interleaving introduced by hadd. */
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    /* Tail: 4 remaining points (bit 5 of the byte count). */
    if (num_bytes >> 5 & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        /* After the permute the four valid sums sit in the upper
         * 128-bit half; extract and store them. */
        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    /* Scalar tail: final 0-3 points. */
    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}
390 
391 #endif /*LV_HAVE_AVX2*/
392 
393 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6527
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
Definition: sse2neon.h:1917
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
Definition: sse2neon.h:2145
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
int64x1_t __m64
Definition: sse2neon.h:234
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
Definition: sse2neon.h:2751
static void volk_32fc_x2_square_dist_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:285
static void volk_32fc_x2_square_dist_32f_neon(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:251
static void volk_32fc_x2_square_dist_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:168
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13