Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
61 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
62 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
63 
64 #ifdef LV_HAVE_GENERIC
65 
66 static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
67  const lv_32fc_t* aVector,
68  const float* bVector,
69  unsigned int num_points)
70 {
71  lv_32fc_t* cPtr = cVector;
72  const lv_32fc_t* aPtr = aVector;
73  const float* bPtr = bVector;
74  unsigned int number = 0;
75 
76  for (number = 0; number < num_points; number++) {
77  *cPtr++ = (*aPtr++) + (*bPtr++);
78  }
79 }
80 #endif /* LV_HAVE_GENERIC */
81 
82 
83 #ifdef LV_HAVE_AVX
84 #include <immintrin.h>
85 
86 static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
87  const lv_32fc_t* aVector,
88  const float* bVector,
89  unsigned int num_points)
90 {
91  unsigned int number = 0;
92  const unsigned int eighthPoints = num_points / 8;
93 
94  lv_32fc_t* cPtr = cVector;
95  const lv_32fc_t* aPtr = aVector;
96  const float* bPtr = bVector;
97 
98  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
99  __m256 cpx_b1, cpx_b2;
100  __m256 zero;
101  zero = _mm256_setzero_ps();
102  __m256 tmp1, tmp2;
103  for (; number < eighthPoints; number++) {
104 
105  aVal1 = _mm256_loadu_ps((float*)aPtr);
106  aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
107  bVal = _mm256_loadu_ps(bPtr);
108  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
109  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
110 
111  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
112  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
113 
114  cVal1 = _mm256_add_ps(aVal1, tmp1);
115  cVal2 = _mm256_add_ps(aVal2, tmp2);
116 
117  _mm256_storeu_ps((float*)cPtr,
118  cVal1); // Store the results back into the C container
119  _mm256_storeu_ps((float*)(cPtr + 4),
120  cVal2); // Store the results back into the C container
121 
122  aPtr += 8;
123  bPtr += 8;
124  cPtr += 8;
125  }
126 
127  number = eighthPoints * 8;
128  for (; number < num_points; number++) {
129  *cPtr++ = (*aPtr++) + (*bPtr++);
130  }
131 }
132 #endif /* LV_HAVE_AVX */
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136 
137 static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
138  const lv_32fc_t* aVector,
139  const float* bVector,
140  unsigned int num_points)
141 {
142  unsigned int number = 0;
143  const unsigned int eighthPoints = num_points / 8;
144 
145  lv_32fc_t* cPtr = cVector;
146  const lv_32fc_t* aPtr = aVector;
147  const float* bPtr = bVector;
148 
149  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
150  __m256 cpx_b1, cpx_b2;
151  __m256 zero;
152  zero = _mm256_setzero_ps();
153  __m256 tmp1, tmp2;
154  for (; number < eighthPoints; number++) {
155 
156  aVal1 = _mm256_load_ps((float*)aPtr);
157  aVal2 = _mm256_load_ps((float*)(aPtr + 4));
158  bVal = _mm256_load_ps(bPtr);
159  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
160  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
161 
162  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
163  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
164 
165  cVal1 = _mm256_add_ps(aVal1, tmp1);
166  cVal2 = _mm256_add_ps(aVal2, tmp2);
167 
168  _mm256_store_ps((float*)cPtr,
169  cVal1); // Store the results back into the C container
170  _mm256_store_ps((float*)(cPtr + 4),
171  cVal2); // Store the results back into the C container
172 
173  aPtr += 8;
174  bPtr += 8;
175  cPtr += 8;
176  }
177 
178  number = eighthPoints * 8;
179  for (; number < num_points; number++) {
180  *cPtr++ = (*aPtr++) + (*bPtr++);
181  }
182 }
183 #endif /* LV_HAVE_AVX */
184 
185 #ifdef LV_HAVE_NEON
186 #include <arm_neon.h>
187 
/*
 * NEON kernel: element-wise sum of a complex vector and a real vector,
 * cVector[i] = aVector[i] + bVector[i].
 *
 * Sixteen points are handled per loop iteration as two groups of eight.
 * Each group is loaded with a 4-way de-interleaving load of the complex
 * data and a 2-way de-interleaving load of the real addends; the addends
 * are added to lanes val[0] and val[2] of the complex group only, and the
 * group is written back with the matching 4-way interleaving store.
 * Any remaining (num_points % 16) points are handled by a scalar tail loop.
 *
 * cVector    - output buffer; num_points complex values are written
 * aVector    - complex input; num_points values are read
 * bVector    - real (float) input; num_points values are read
 * num_points - number of elements to process
 */
static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;

    const lv_32fc_t* src = aVector;
    const float* addend = bVector;
    lv_32fc_t* dst = cVector;
    unsigned int i;

    for (i = 0; i < blockCount; i++) {
        // Complex input: two groups of 8 points (16 floats each).
        float32x4x4_t cpxFirst = vld4q_f32((const float*)src);
        float32x4x4_t cpxSecond = vld4q_f32((const float*)(src + 8));
        src += 16;
        __VOLK_PREFETCH(src + 16);

        // Real addends: two groups of 8 floats, split into even/odd elements.
        float32x4x2_t realFirst = vld2q_f32(addend);
        float32x4x2_t realSecond = vld2q_f32(addend + 8);
        addend += 16;
        __VOLK_PREFETCH(addend + 16);

        // Only val[0] and val[2] receive an addend; val[1] and val[3] pass
        // through to the store unchanged.
        cpxFirst.val[0] = vaddq_f32(cpxFirst.val[0], realFirst.val[0]);
        cpxFirst.val[2] = vaddq_f32(cpxFirst.val[2], realFirst.val[1]);
        cpxSecond.val[0] = vaddq_f32(cpxSecond.val[0], realSecond.val[0]);
        cpxSecond.val[2] = vaddq_f32(cpxSecond.val[2], realSecond.val[1]);

        vst4q_f32((float*)dst, cpxFirst);
        vst4q_f32((float*)(dst + 8), cpxSecond);
        dst += 16;
    }

    // Scalar tail for the points that do not fill a full group of sixteen.
    for (i = blockCount * 16; i < num_points; i++) {
        *dst++ = (*src++) + (*addend++);
    }
}
231 #endif /* LV_HAVE_NEON */
232 
233 
234 #endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
static void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:137
static void volk_32fc_32f_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:66
static void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:86
static void volk_32fc_32f_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:188
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71
float complex lv_32fc_t
Definition: volk_complex.h:74