Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_64f_add_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_64f_add_64f_H
61 #define INCLUDED_volk_32f_64f_add_64f_H
62 
63 #include <inttypes.h>
64 
65 #ifdef LV_HAVE_GENERIC
66 
static inline void volk_32f_64f_add_64f_generic(double* cVector,
                                                const float* aVector,
                                                const double* bVector,
                                                unsigned int num_points)
{
    /* Portable scalar fallback: cVector[i] = (double)aVector[i] + bVector[i]
     * for each of the num_points elements. */
    for (unsigned int n = 0; n < num_points; n++) {
        cVector[n] = (double)aVector[n] + bVector[n];
    }
}
81 
82 #endif /* LV_HAVE_GENERIC */
83 
84 #ifdef LV_HAVE_NEONV8
85 #include <arm_neon.h>
86 
/*!
 * \brief NEONv8 path: adds each float in aVector (widened to double) to the
 * matching double in bVector, writing the double sums to cVector.
 *
 * \param cVector    output buffer of num_points doubles
 * \param aVector    input buffer of num_points floats
 * \param bVector    input buffer of num_points doubles
 * \param num_points number of elements to process
 */
static inline void volk_32f_64f_add_64f_neon(double* cVector,
                                             const float* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    /* float64x2_t holds 2 doubles, so the vector loop covers pairs. */
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;

    float64x2_t aVal, bVal, cVal;
    float32x2_t aVal1;
    for (number = 0; number < half_points; number++) {
        // Load 2 floats and 2 doubles into NEON registers
        aVal1 = vld1_f32(aPtr);
        bVal = vld1q_f64(bPtr);
        __VOLK_PREFETCH(aPtr + 2);
        __VOLK_PREFETCH(bPtr + 2);
        aPtr += 2; // 2 elements consumed per iteration
        bPtr += 2;

        // Widen the 2 floats to 2 doubles
        aVal = vcvt_f64_f32(aVal1);
        // Vector add: 2 doubles at once
        cVal = vaddq_f64(aVal, bVal);
        // Store the results back into the C container
        vst1q_f64(cPtr, cVal);

        cPtr += 2;
    }

    // Scalar tail: handles the last element when num_points is odd
    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
    }
}
125 
126 #endif /* LV_HAVE_NEONV8 */
127 
128 #ifdef LV_HAVE_AVX
129 
130 #include <immintrin.h>
131 #include <xmmintrin.h>
132 
133 static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
134  const float* aVector,
135  const double* bVector,
136  unsigned int num_points)
137 {
138  unsigned int number = 0;
139  const unsigned int eighth_points = num_points / 8;
140 
141  double* cPtr = cVector;
142  const float* aPtr = aVector;
143  const double* bPtr = bVector;
144 
145  __m256 aVal;
146  __m128 aVal1, aVal2;
147  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
148  for (; number < eighth_points; number++) {
149 
150  aVal = _mm256_loadu_ps(aPtr);
151  bVal1 = _mm256_loadu_pd(bPtr);
152  bVal2 = _mm256_loadu_pd(bPtr + 4);
153 
154  aVal1 = _mm256_extractf128_ps(aVal, 0);
155  aVal2 = _mm256_extractf128_ps(aVal, 1);
156 
157  aDbl1 = _mm256_cvtps_pd(aVal1);
158  aDbl2 = _mm256_cvtps_pd(aVal2);
159 
160  cVal1 = _mm256_add_pd(aDbl1, bVal1);
161  cVal2 = _mm256_add_pd(aDbl2, bVal2);
162 
163  _mm256_storeu_pd(cPtr,
164  cVal1); // Store the results back into the C container
165  _mm256_storeu_pd(cPtr + 4,
166  cVal2); // Store the results back into the C container
167 
168  aPtr += 8;
169  bPtr += 8;
170  cPtr += 8;
171  }
172 
173  number = eighth_points * 8;
174  for (; number < num_points; number++) {
175  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
176  }
177 }
178 
179 #endif /* LV_HAVE_AVX */
180 
181 #ifdef LV_HAVE_AVX
182 
183 #include <immintrin.h>
184 #include <xmmintrin.h>
185 
186 static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
187  const float* aVector,
188  const double* bVector,
189  unsigned int num_points)
190 {
191  unsigned int number = 0;
192  const unsigned int eighth_points = num_points / 8;
193 
194  double* cPtr = cVector;
195  const float* aPtr = aVector;
196  const double* bPtr = bVector;
197 
198  __m256 aVal;
199  __m128 aVal1, aVal2;
200  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
201  for (; number < eighth_points; number++) {
202 
203  aVal = _mm256_load_ps(aPtr);
204  bVal1 = _mm256_load_pd(bPtr);
205  bVal2 = _mm256_load_pd(bPtr + 4);
206 
207  aVal1 = _mm256_extractf128_ps(aVal, 0);
208  aVal2 = _mm256_extractf128_ps(aVal, 1);
209 
210  aDbl1 = _mm256_cvtps_pd(aVal1);
211  aDbl2 = _mm256_cvtps_pd(aVal2);
212 
213  cVal1 = _mm256_add_pd(aDbl1, bVal1);
214  cVal2 = _mm256_add_pd(aDbl2, bVal2);
215 
216  _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
217  _mm256_store_pd(cPtr + 4,
218  cVal2); // Store the results back into the C container
219 
220  aPtr += 8;
221  bPtr += 8;
222  cPtr += 8;
223  }
224 
225  number = eighth_points * 8;
226  for (; number < num_points; number++) {
227  *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
228  }
229 }
230 
231 #endif /* LV_HAVE_AVX */
232 
233 #endif /* INCLUDED_volk_32f_64f_add_64f_H */
float32x4_t __m128
Definition: sse2neon.h:235
static void volk_32f_64f_add_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:186
static void volk_32f_64f_add_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:133
static void volk_32f_64f_add_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_add_64f.h:67
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:71