Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
42 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
43 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
44 
45 #include <inttypes.h>
46 #include <stdio.h>
47 
48 #ifdef LV_HAVE_AVX
49 #include <immintrin.h>
50 
51 static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
52  const lv_32fc_t* aVector,
53  const float* bVector,
54  unsigned int num_points)
55 {
56  unsigned int number = 0;
57  const unsigned int eighthPoints = num_points / 8;
58 
59  lv_32fc_t* cPtr = cVector;
60  const lv_32fc_t* aPtr = aVector;
61  const float* bPtr = bVector;
62 
63  __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
64 
65  __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
66 
67  for (; number < eighthPoints; number++) {
68 
69  aVal1 = _mm256_load_ps((float*)aPtr);
70  aPtr += 4;
71 
72  aVal2 = _mm256_load_ps((float*)aPtr);
73  aPtr += 4;
74 
75  bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
76  bPtr += 8;
77 
78  bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
79  bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
80 
81  bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
82  bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
83 
84  cVal1 = _mm256_mul_ps(aVal1, bVal1);
85  cVal2 = _mm256_mul_ps(aVal2, bVal2);
86 
87  _mm256_store_ps((float*)cPtr,
88  cVal1); // Store the results back into the C container
89  cPtr += 4;
90 
91  _mm256_store_ps((float*)cPtr,
92  cVal2); // Store the results back into the C container
93  cPtr += 4;
94  }
95 
96  number = eighthPoints * 8;
97  for (; number < num_points; ++number) {
98  *cPtr++ = (*aPtr++) * (*bPtr++);
99  }
100 }
101 #endif /* LV_HAVE_AVX */
102 
103 
104 #ifdef LV_HAVE_SSE
105 #include <xmmintrin.h>
106 
107 static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
108  const lv_32fc_t* aVector,
109  const float* bVector,
110  unsigned int num_points)
111 {
112  unsigned int number = 0;
113  const unsigned int quarterPoints = num_points / 4;
114 
115  lv_32fc_t* cPtr = cVector;
116  const lv_32fc_t* aPtr = aVector;
117  const float* bPtr = bVector;
118 
119  __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
120  for (; number < quarterPoints; number++) {
121 
122  aVal1 = _mm_load_ps((const float*)aPtr);
123  aPtr += 2;
124 
125  aVal2 = _mm_load_ps((const float*)aPtr);
126  aPtr += 2;
127 
128  bVal = _mm_load_ps(bPtr);
129  bPtr += 4;
130 
131  bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
132  bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
133 
134  cVal = _mm_mul_ps(aVal1, bVal1);
135 
136  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
137  cPtr += 2;
138 
139  cVal = _mm_mul_ps(aVal2, bVal2);
140 
141  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
142 
143  cPtr += 2;
144  }
145 
146  number = quarterPoints * 4;
147  for (; number < num_points; number++) {
148  *cPtr++ = (*aPtr++) * (*bPtr);
149  bPtr++;
150  }
151 }
152 #endif /* LV_HAVE_SSE */
153 
154 
155 #ifdef LV_HAVE_GENERIC
156 
157 static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
158  const lv_32fc_t* aVector,
159  const float* bVector,
160  unsigned int num_points)
161 {
162  lv_32fc_t* cPtr = cVector;
163  const lv_32fc_t* aPtr = aVector;
164  const float* bPtr = bVector;
165  unsigned int number = 0;
166 
167  for (number = 0; number < num_points; number++) {
168  *cPtr++ = (*aPtr++) * (*bPtr++);
169  }
170 }
171 #endif /* LV_HAVE_GENERIC */
172 
173 
174 #ifdef LV_HAVE_NEON
175 #include <arm_neon.h>
176 
/*
 * NEON kernel: multiply every complex sample in aVector by the real scalar at
 * the same index in bVector and store the complex product in cVector.
 *
 *   cVector    - output buffer, num_points complex floats
 *   aVector    - complex input, num_points complex floats
 *   bVector    - real input, num_points floats
 *   num_points - number of samples to process
 *
 * Four samples are processed per vector iteration; the remaining
 * (num_points % 4) samples go through the scalar tail loop.
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    float32x4x2_t inputVector, outputVector;
    float32x4_t tapsVector;

    for (number = 0; number < quarter_points; number++) {
        /*
         * vld2q_f32 de-interleaves eight floats: even-indexed elements land
         * in val[0] and odd-indexed ones in val[1], i.e. the real and
         * imaginary parts of four complex samples respectively.
         */
        inputVector = vld2q_f32((const float*)aPtr);
        tapsVector = vld1q_f32(bPtr);

        /* The same four scalars scale both the real and imaginary planes. */
        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

        /* vst2q_f32 re-interleaves the two planes back into re|im pairs. */
        vst2q_f32((float*)cPtr, outputVector);
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
207 #endif /* LV_HAVE_NEON */
208 
209 
210 #ifdef LV_HAVE_ORC
211 
212 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
213  const lv_32fc_t* aVector,
214  const float* bVector,
215  unsigned int num_points);
216 
217 static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
218  const lv_32fc_t* aVector,
219  const float* bVector,
220  unsigned int num_points)
221 {
222  volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
223 }
224 
225 #endif /* LV_HAVE_ORC */
226 
227 
228 #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
float32x4_t __m128
Definition: sse2neon.h:235
#define _mm_shuffle_ps(a, b, imm)
Definition: sse2neon.h:2586
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:177
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:157
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:51
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:107
float complex lv_32fc_t
Definition: volk_complex.h:74