Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

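/*
 * The kernels below compute out[n] = in[n] * (*phase) * phase_inc^n for
 * n = 0 ... num_points - 1 and write the advanced (periodically re-normalized)
 * phasor back into *phase, so consecutive calls produce a phase-continuous
 * rotation. phase_inc is expected to have unit magnitude.
 *
 * A minimal usage sketch (illustrative only: it calls the generic protokernel
 * defined below directly and picks an arbitrary buffer size and frequency):
 *
 * \code
 *   #include <math.h>
 *   #include <volk/volk_complex.h>
 *
 *   enum { N = 1024 };
 *   lv_32fc_t in[N], out[N];
 *   // ... fill in[] with samples ...
 *
 *   // Rotate by 0.1 cycles per sample.
 *   const float w = 2.0f * 3.14159265358979f * 0.1f;
 *   const lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
 *
 *   volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, N);
 * \endcode
 */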
#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_complex.h>
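// The phase accumulator is renormalized to unit magnitude every ROTATOR_RELOAD
// samples so floating-point rounding cannot make its magnitude drift; the _2
// and _4 variants express that period in 2-sample and 4-sample SIMD steps.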
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }

        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
    if (i) {
        // Make sure we normalize the phase on every call!
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)

{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    // const unsigned int quarter_points = num_points / 4;

    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // Note that incr has been advanced to phase_inc^4 by the loop above
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
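    // vld2q_f32() de-interleaves the complex samples: .val[0] receives the four
    // real parts and .val[1] the four imaginary parts, the layout expected by
    // _vmultiply_complexq_f32()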
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            // Prefetch next one, speeds things up
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(inputVectorPtr + 4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // i != 0 means the loop above ran at least once
    if (i) {
        // normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    // Store the phase so the next call to this function continues seamlessly
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

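    // Each __m128 holds two complex floats as { re0, im0, re1, im1 }. The
    // products below use the SSE3 complex-multiply idiom: duplicate the real
    // parts (moveldup) and imaginary parts (movehdup) of one factor, swap
    // re/im of the other factor (shuffle 0xB1), multiply, and combine with
    // addsub.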
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
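        // Renormalize both phase lanes: square, horizontally add to get each
        // lane's squared magnitude, shuffle it next to its re/im pair, then
        // take the square root and divide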
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
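    // If num_points is odd, rotate the final sample in scalar code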
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

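    // _mm256_complexmul_ps() multiplies four complex floats per register and
    // _mm256_normalize_ps() rescales them back to unit magnitude; both helpers
    // come from volk_avx_intrinsics.h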
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
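    // Process the remaining (num_points % 4) samples with the generic kernel,
    // continuing from the phase just stored back into *phase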
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

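    // Same interleaved complex-multiply scheme as the SSE4.1 kernels, but the
    // multiply by the duplicated real parts and the add/sub are fused into a
    // single _mm256_fmaddsub_ps()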
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */