63 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
64 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (unaligned loads/stores, AVX2 + FMA).
 *
 * \param cVector   output vector (num_points complex floats)
 * \param aVector   input vector (num_points complex floats)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points after the SIMD loop
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // ar,ai,br,bi, ...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        // fused multiply with alternating sub/add:
        // z = ar*cr - ai*ci, ai*cr + ar*ci, ...
        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);

        _mm256_storeu_ps((float*)c, z); // store 4 complex results

        a += 4;
        c += 4;
    }

    // Finish the 0-3 leftover points with scalar arithmetic
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (unaligned loads/stores, AVX).
 *
 * \param cVector   output vector (num_points complex floats)
 * \param aVector   input vector (num_points complex floats)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points after the SIMD loop
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // ar,ai,br,bi, ...

        tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        // z = ar*cr - ai*ci, ai*cr + ar*ci, ...
        z = _mm256_addsub_ps(tmp1, tmp2);

        _mm256_storeu_ps((float*)c, z); // store 4 complex results

        a += 4;
        c += 4;
    }

    // Finish the 0-3 leftover points with scalar arithmetic
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (unaligned loads/stores, SSE3).
 *
 * \param cVector   output vector (num_points complex floats)
 * \param aVector   input vector (num_points complex floats)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di

        // z = ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_storeu_ps((float*)c, z); // store 2 complex results

        a += 2;
        c += 2;
    }

    // Handle the odd leftover point, if any
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
203 #ifdef LV_HAVE_GENERIC
208 unsigned int num_points)
212 unsigned int number = num_points;
215 while (number >= 8) {
216 *cPtr++ = (*aPtr++) * scalar;
217 *cPtr++ = (*aPtr++) * scalar;
218 *cPtr++ = (*aPtr++) * scalar;
219 *cPtr++ = (*aPtr++) * scalar;
220 *cPtr++ = (*aPtr++) * scalar;
221 *cPtr++ = (*aPtr++) * scalar;
222 *cPtr++ = (*aPtr++) * scalar;
223 *cPtr++ = (*aPtr++) * scalar;
229 *cPtr++ = *aPtr++ * scalar;
235 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
236 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
239 #include <inttypes.h>
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (32-byte-aligned loads/stores, AVX2 + FMA).
 *
 * \param cVector   output vector (num_points complex floats, 32-byte aligned)
 * \param aVector   input vector (num_points complex floats, 32-byte aligned)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points after the SIMD loop
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // ar,ai,br,bi, ...

        tmp1 = x;

        x = _mm256_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        // fused multiply with alternating sub/add:
        // z = ar*cr - ai*ci, ai*cr + ar*ci, ...
        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);

        _mm256_store_ps((float*)c, z); // store 4 complex results

        a += 4;
        c += 4;
    }

    // Finish the 0-3 leftover points with scalar arithmetic
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (32-byte-aligned loads/stores, AVX).
 *
 * \param cVector   output vector (num_points complex floats, 32-byte aligned)
 * \param aVector   input vector (num_points complex floats, 32-byte aligned)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // 0-3 leftover points after the SIMD loop
    __m256 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm256_set1_ps(lv_creal(scalar));
    yh = _mm256_set1_ps(lv_cimag(scalar));

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // ar,ai,br,bi, ...

        tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr, ...

        x = _mm256_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br, ...

        tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di, ...

        // z = ar*cr - ai*ci, ai*cr + ar*ci, ...
        z = _mm256_addsub_ps(tmp1, tmp2);

        _mm256_store_ps((float*)c, z); // store 4 complex results

        a += 4;
        c += 4;
    }

    // Finish the 0-3 leftover points with scalar arithmetic
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (16-byte-aligned loads/stores, SSE3).
 *
 * \param cVector   output vector (num_points complex floats, 16-byte aligned)
 * \param aVector   input vector (num_points complex floats, 16-byte aligned)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Splat the scalar's real part into yl and its imaginary part into yh
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr

        x = _mm_shuffle_ps(x, x, 0xB1); // swap re/im: ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di

        // z = ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_store_ps((float*)c, z); // store 2 complex results

        a += 2;
        c += 2;
    }

    // Handle the odd leftover point, if any
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * Multiply the complex vector aVector by the complex scalar and store the
 * result in cVector (ARM NEON).
 *
 * \param cVector   output vector (num_points complex floats)
 * \param aVector   input vector (num_points complex floats)
 * \param scalar    complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t tmp_imag;

    // Splat scalar real part into val[0] and imaginary part into val[1].
    // NOTE(review): this reinterprets &scalar as two packed floats (re, im) —
    // matches the lv_32fc_t layout used by lv_creal/lv_cimag.
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
    for (number = 0; number < quarter_points; ++number) {
        // De-interleaved load: val[0] = 4 real parts, val[1] = 4 imag parts
        a_val = vld2q_f32((float*)aPtr);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); // ai*cr
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); // ar*cr

        // Accumulate the cross terms:
        // imag = ai*cr + ar*ci, real = ar*cr - ai*ci
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);

        // Interleaving store re-packs the results as re,im pairs
        vst2q_f32((float*)cPtr, tmp_imag);
        aPtr += 4;
        cPtr += 4;
    }

    // Finish the 0-3 leftover points with scalar arithmetic
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */
413 #ifdef LV_HAVE_GENERIC
418 unsigned int num_points)
422 unsigned int number = num_points;
425 while (number >= 8) {
426 *cPtr++ = (*aPtr++) * scalar;
427 *cPtr++ = (*aPtr++) * scalar;
428 *cPtr++ = (*aPtr++) * scalar;
429 *cPtr++ = (*aPtr++) * scalar;
430 *cPtr++ = (*aPtr++) * scalar;
431 *cPtr++ = (*aPtr++) * scalar;
432 *cPtr++ = (*aPtr++) * scalar;
433 *cPtr++ = (*aPtr++) * scalar;
439 *cPtr++ = *aPtr++ * scalar;
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:6496
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:415
static void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:205
static void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:118
static void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:162
static void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:291
static void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:335
static void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:379
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13