42 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
43 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
/*
 * AVX kernel fragment. The cross-reference entries at the end of this listing
 * name it volk_32fc_32f_multiply_32fc_a_avx(cVector, aVector, bVector,
 * num_points), defined at line 51 of volk_32fc_32f_multiply_32fc.h.
 *
 * NOTE(review): the number leading each line appears to be the line number in
 * that header, and there are gaps in the numbering. The function header, the
 * aPtr/cPtr declarations, any pointer advances inside the vector loop and all
 * closing braces are not visible here. Confirm every remark below against the
 * complete header before relying on it.
 */
49 #include <immintrin.h>
54 unsigned int num_points)
56 unsigned int number = 0;
/* Number of complete groups of eight points handled by the vector loop. */
57 const unsigned int eighthPoints = num_points / 8;
61 const float* bPtr = bVector;
63 __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
/*
 * _mm256_set_epi32 takes its arguments from the highest element down, so the
 * selectors are 0,0,1,1 for the low 128-bit lane and 2,2,3,3 for the high one.
 */
65 __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
67 for (; number < eighthPoints; number++) {
/*
 * NOTE(review): in the visible lines aVal1 and aVal2 are both loaded from the
 * same aPtr expression. Lines 70-71 and 73-74 are absent; presumably aPtr is
 * advanced after each load - confirm. _mm256_load_ps is the aligned load.
 */
69 aVal1 = _mm256_load_ps((
float*)aPtr);
72 aVal2 = _mm256_load_ps((
float*)aPtr);
/* Eight consecutive floats from bPtr. */
75 bVal = _mm256_load_ps(bPtr);
/*
 * Selector 0x00 copies the low 128 bits of bVal into both lanes of bVal1;
 * selector 0x11 copies the high 128 bits of bVal into both lanes of bVal2.
 */
78 bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00);
79 bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11);
/*
 * With the mask above, every float of bVal ends up in two adjacent slots:
 * bVal1 holds floats 0-3 of bVal, bVal2 holds floats 4-7, each duplicated.
 * Presumably this pairs one float with the two components of one complex
 * point - confirm against the element type behind aPtr.
 */
81 bVal1 = _mm256_permutevar_ps(bVal1, permute_mask);
82 bVal2 = _mm256_permutevar_ps(bVal2, permute_mask);
84 cVal1 = _mm256_mul_ps(aVal1, bVal1);
85 cVal2 = _mm256_mul_ps(aVal2, bVal2);
/*
 * NOTE(review): both store calls are cut off after the destination cast; the
 * value argument and any cPtr advance (lines 88-90, 92-95) are not visible.
 */
87 _mm256_store_ps((
float*)cPtr,
91 _mm256_store_ps((
float*)cPtr,
/*
 * Scalar loop for the num_points % 8 points left after the vector loop: one
 * multiply per point, post-incrementing aPtr, bPtr and cPtr.
 */
96 number = eighthPoints * 8;
97 for (; number < num_points; ++number) {
98 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * SSE kernel fragment. The cross-reference entries at the end of this listing
 * name it volk_32fc_32f_multiply_32fc_a_sse(cVector, aVector, bVector,
 * num_points), defined at line 107 of volk_32fc_32f_multiply_32fc.h.
 *
 * NOTE(review): the leading numbers appear to be line numbers in that header.
 * They jump from 120 to 146, so the whole body of the vector loop is absent,
 * as are the function header and the aPtr/cPtr declarations.
 */
105 #include <xmmintrin.h>
109 const float* bVector,
110 unsigned int num_points)
112 unsigned int number = 0;
/* Number of complete groups of four points handled by the vector loop. */
113 const unsigned int quarterPoints = num_points / 4;
117 const float* bPtr = bVector;
119 __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
120 for (; number < quarterPoints; number++) {
/* Scalar loop for the num_points % 4 points left after the vector loop. */
146 number = quarterPoints * 4;
147 for (; number < num_points; number++) {
/*
 * NOTE(review): bPtr is read but not advanced on this line, unlike the tail
 * loops of the other kernels in this listing. Line 149 is not visible;
 * confirm that bPtr is incremented there, otherwise every tail point would
 * be scaled by the same float.
 */
148 *cPtr++ = (*aPtr++) * (*bPtr);
/*
 * Portable kernel fragment, compiled only when LV_HAVE_GENERIC is defined.
 * The cross-reference entries at the end of this listing name it
 * volk_32fc_32f_multiply_32fc_generic(cVector, aVector, bVector, num_points),
 * defined at line 157 of volk_32fc_32f_multiply_32fc.h.
 *
 * NOTE(review): the function header and the aPtr/cPtr declarations are not
 * visible in this listing; presumably they alias aVector and cVector the way
 * bPtr aliases bVector - confirm.
 */
155 #ifdef LV_HAVE_GENERIC
159 const float* bVector,
160 unsigned int num_points)
164 const float* bPtr = bVector;
165 unsigned int number = 0;
/*
 * One multiply per point over all num_points points, post-incrementing the
 * two input pointers and the output pointer each iteration.
 */
167 for (number = 0; number < num_points; number++) {
168 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * NEON kernel fragment. The cross-reference entries at the end of this
 * listing name it volk_32fc_32f_multiply_32fc_neon(cVector, aVector, bVector,
 * num_points), defined at line 177 of volk_32fc_32f_multiply_32fc.h.
 *
 * NOTE(review): the function header, the aPtr/cPtr declarations and lines
 * 198-202 (presumably the pointer advances and the loop's closing brace) are
 * not visible in this listing - confirm against the complete header.
 */
175 #include <arm_neon.h>
179 const float* bVector,
180 unsigned int num_points)
184 const float* bPtr = bVector;
185 unsigned int number = 0;
/* Number of complete groups of four points handled by the vector loop. */
186 unsigned int quarter_points = num_points / 4;
188 float32x4x2_t inputVector, outputVector;
189 float32x4_t tapsVector;
190 for (number = 0; number < quarter_points; number++) {
/*
 * vld2q_f32 reads eight consecutive floats and de-interleaves them:
 * even-indexed floats go to val[0], odd-indexed floats to val[1].
 */
191 inputVector = vld2q_f32((
float*)aPtr);
/* Four consecutive floats from bPtr. */
192 tapsVector = vld1q_f32(bPtr);
/* Both de-interleaved halves are scaled by the same four floats. */
194 outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
195 outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
/* vst2q_f32 re-interleaves val[0] and val[1] while storing eight floats. */
197 vst2q_f32((
float*)cPtr, outputVector);
/*
 * Scalar loop for the num_points % 4 points left after the vector loop: one
 * multiply per point, post-incrementing aPtr, bPtr and cPtr.
 */
203 for (number = quarter_points * 4; number < num_points; number++) {
204 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * Declaration of an externally defined implementation; its body is not part
 * of this listing.
 *
 * NOTE(review): line 213 is absent between cVector and bVector. The caller
 * below passes aVector in that position, so it is presumably the aVector
 * parameter - confirm against the complete header.
 */
212 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(
lv_32fc_t* cVector,
214 const float* bVector,
215 unsigned int num_points);
/*
 * Thin wrapper: forwards its four arguments unchanged to
 * volk_32fc_32f_multiply_32fc_a_orc_impl.
 *
 * NOTE(review): line 218 (presumably the aVector parameter, which the call
 * below uses) and the braces around the body are absent from this listing.
 * The wrapper is named "_u_orc" while the function it calls is named
 * "_a_orc_impl"; confirm that pairing is intended.
 */
217 static inline void volk_32fc_32f_multiply_32fc_u_orc(
lv_32fc_t* cVector,
219 const float* bVector,
220 unsigned int num_points)
222 volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:177
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:157
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:51
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:107
float complex lv_32fc_t
Definition: volk_complex.h:74