73 #ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
74 #define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Computes c[i] = a[i] + conj(b[i]) * scalar for each complex sample.
 *
 * Portable scalar reference implementation; all SIMD variants below must
 * produce the same results.
 *
 * \param cVector    output buffer (num_points complex floats)
 * \param aVector    addend input buffer
 * \param bVector    input buffer that is conjugated and scaled
 * \param scalar     complex scale factor applied to conj(b[i])
 * \param num_points number of complex samples to process
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // Manually unrolled by 8 to reduce loop overhead.
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        number -= 8;
    }

    // Clean up the remaining 0..7 samples.
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX (unaligned) version: c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Processes 4 complex floats (one 256-bit lane) per iteration using
 * unaligned loads/stores; the 0..3 leftover samples fall back to the
 * scalar expression.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover count (num_points mod 4)

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across the whole vector register.
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);                 // a + scalar * conj(b)
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the final 0..3 samples.
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 (unaligned) version: c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Processes 2 complex floats (one 128-bit lane) per iteration using
 * unaligned loads/stores; a single trailing sample is handled with the
 * scalar expression.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across the vector register.
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);                 // a + scalar * conj(b)
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // Handle an odd final sample, if any.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * \brief AVX (aligned) version: c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Identical math to the unaligned AVX variant, but uses aligned
 * loads/stores, so all three buffers must be 32-byte aligned.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3; // leftover count (num_points mod 4)

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across the whole vector register.
    // (The stack array itself need not be aligned, hence loadu here.)
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);                 // a + scalar * conj(b)
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // Scalar tail for the final 0..3 samples.
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * \brief SSE3 (aligned) version: c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Identical math to the unaligned SSE3 variant, but uses aligned
 * loads/stores, so all three buffers must be 16-byte aligned.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Broadcast the complex scalar across the vector register.
    // (The stack array itself need not be aligned, hence loadu here.)
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);                 // a + scalar * conj(b)
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // Handle an odd final sample, if any.
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON version: c[i] = a[i] + conj(b[i]) * scalar.
 *
 * Uses deinterleaving loads (vld2q) so that the real parts of four
 * samples land in val[0] and the imaginary parts in val[1]. The
 * conjugate is realized by negating b's imaginary lane, then the
 * complex multiply by the broadcast scalar is built from
 * vmulq/vmlaq/vmlsq. Remaining 0..3 samples fall back to scalar code.
 */
static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t scalar,
                                                    unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast real (val[0]) and imaginary (val[1]) parts of the scalar.
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        b_val.val[1] = vnegq_f32(b_val.val[1]); // conj(b): negate imag lane
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        // (br + j*bi') * (sr + j*si):
        //   real = br*sr - bi'*si, imag = bi'*sr + br*si
        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        // Add a.
        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail for the final 0..3 samples.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_NEON */
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
Definition: sse2neon.h:2787
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
Definition: sse2neon.h:1941
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:210
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:85
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:167
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:255
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:122
static void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h:297
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:38
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
#define lv_conj(x)
Definition: volk_complex.h:100
float complex lv_32fc_t
Definition: volk_complex.h:74
for i
Definition: volk_config_fixed.tmpl.h:13
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:31