Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_64f_x2_max_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_64f_x2_max_64f_a_H
59 #define INCLUDED_volk_64f_x2_max_64f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 #ifdef LV_HAVE_AVX512F
65 #include <immintrin.h>
66 
/*
 * Element-wise maximum of two double arrays, AVX-512F aligned variant.
 *
 * cVector[i] receives the larger of aVector[i] and bVector[i] for every
 * i in [0, num_points).
 *
 * The bulk of the data is handled eight doubles at a time with
 * _mm512_load_pd / _mm512_store_pd, which require 64-byte aligned addresses.
 * Whatever is left over (num_points % 8 elements) is handled one element at
 * a time; there the result is bVector[i] whenever "a > b" is false.
 */
static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    /* Count of leading elements that fit into whole 8-wide vectors. */
    const unsigned int vectorEnd = num_points - (num_points % 8u);
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m512d lhs = _mm512_load_pd(aVector + idx);
        const __m512d rhs = _mm512_load_pd(bVector + idx);
        const __m512d larger = _mm512_max_pd(lhs, rhs);

        _mm512_store_pd(cVector + idx, larger);
        idx += 8u;
    }

    /* Scalar tail for the elements that did not fill a full vector. */
    while (idx < num_points) {
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
        idx++;
    }
}
101 #endif /* LV_HAVE_AVX512F */
102 
103 
104 #ifdef LV_HAVE_AVX
105 #include <immintrin.h>
106 
/*
 * Element-wise maximum of two double arrays, AVX aligned variant.
 *
 * cVector[i] receives the larger of aVector[i] and bVector[i] for every
 * i in [0, num_points).
 *
 * Four doubles are processed per iteration with _mm256_load_pd /
 * _mm256_store_pd, which require 32-byte aligned addresses. The remaining
 * num_points % 4 elements are handled one at a time; there the result is
 * bVector[i] whenever "a > b" is false.
 */
static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    /* Count of leading elements that fit into whole 4-wide vectors. */
    const unsigned int vectorEnd = num_points - (num_points % 4u);
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m256d lhs = _mm256_load_pd(aVector + idx);
        const __m256d rhs = _mm256_load_pd(bVector + idx);
        const __m256d larger = _mm256_max_pd(lhs, rhs);

        _mm256_store_pd(cVector + idx, larger);
        idx += 4u;
    }

    /* Scalar tail for the elements that did not fill a full vector. */
    while (idx < num_points) {
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
        idx++;
    }
}
141 #endif /* LV_HAVE_AVX */
142 
143 
144 #ifdef LV_HAVE_SSE2
145 #include <emmintrin.h>
146 
/*
 * Element-wise maximum of two double arrays, SSE2 aligned variant.
 *
 * cVector[i] receives the larger of aVector[i] and bVector[i] for every
 * i in [0, num_points).
 *
 * Two doubles are processed per iteration with _mm_load_pd / _mm_store_pd,
 * which require 16-byte aligned addresses. If num_points is odd, the final
 * element is handled separately; there the result is bVector[i] whenever
 * "a > b" is false.
 */
static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    /* Count of leading elements that fit into whole 2-wide vectors. */
    const unsigned int vectorEnd = num_points - (num_points % 2u);
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m128d lhs = _mm_load_pd(aVector + idx);
        const __m128d rhs = _mm_load_pd(bVector + idx);
        const __m128d larger = _mm_max_pd(lhs, rhs);

        _mm_store_pd(cVector + idx, larger);
        idx += 2u;
    }

    /* Scalar tail for the element that did not fill a full vector. */
    while (idx < num_points) {
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
        idx++;
    }
}
181 #endif /* LV_HAVE_SSE2 */
182 
183 
184 #ifdef LV_HAVE_GENERIC
185 
/*
 * Element-wise maximum of two double arrays, portable scalar variant.
 *
 * For every i in [0, num_points), cVector[i] is set to aVector[i] when
 * aVector[i] > bVector[i], and to bVector[i] otherwise (so bVector[i] is
 * chosen on ties and whenever the comparison is false).
 */
static inline void volk_64f_x2_max_64f_generic(double* cVector,
                                               const double* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        /* Read both inputs before writing so in-place use stays well defined. */
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
    }
}
202 #endif /* LV_HAVE_GENERIC */
203 
204 
205 #endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
206 
207 
208 #ifndef INCLUDED_volk_64f_x2_max_64f_u_H
209 #define INCLUDED_volk_64f_x2_max_64f_u_H
210 
211 #include <inttypes.h>
212 #include <stdio.h>
213 
214 #ifdef LV_HAVE_AVX512F
215 #include <immintrin.h>
216 
/*
 * Element-wise maximum of two double arrays, AVX-512F unaligned variant.
 *
 * cVector[i] receives the larger of aVector[i] and bVector[i] for every
 * i in [0, num_points).
 *
 * Eight doubles are processed per iteration using the unaligned
 * _mm512_loadu_pd / _mm512_storeu_pd accessors. The remaining
 * num_points % 8 elements are handled one at a time; there the result is
 * bVector[i] whenever "a > b" is false.
 */
static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
                                                 const double* aVector,
                                                 const double* bVector,
                                                 unsigned int num_points)
{
    /* Count of leading elements that fit into whole 8-wide vectors. */
    const unsigned int vectorEnd = num_points - (num_points % 8u);
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m512d lhs = _mm512_loadu_pd(aVector + idx);
        const __m512d rhs = _mm512_loadu_pd(bVector + idx);
        const __m512d larger = _mm512_max_pd(lhs, rhs);

        _mm512_storeu_pd(cVector + idx, larger);
        idx += 8u;
    }

    /* Scalar tail for the elements that did not fill a full vector. */
    while (idx < num_points) {
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
        idx++;
    }
}
251 #endif /* LV_HAVE_AVX512F */
252 
253 
254 #ifdef LV_HAVE_AVX
255 #include <immintrin.h>
256 
/*
 * Element-wise maximum of two double arrays, AVX unaligned variant.
 *
 * cVector[i] receives the larger of aVector[i] and bVector[i] for every
 * i in [0, num_points).
 *
 * Four doubles are processed per iteration using the unaligned
 * _mm256_loadu_pd / _mm256_storeu_pd accessors. The remaining
 * num_points % 4 elements are handled one at a time; there the result is
 * bVector[i] whenever "a > b" is false.
 */
static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    /* Count of leading elements that fit into whole 4-wide vectors. */
    const unsigned int vectorEnd = num_points - (num_points % 4u);
    unsigned int idx = 0;

    while (idx < vectorEnd) {
        const __m256d lhs = _mm256_loadu_pd(aVector + idx);
        const __m256d rhs = _mm256_loadu_pd(bVector + idx);
        const __m256d larger = _mm256_max_pd(lhs, rhs);

        _mm256_storeu_pd(cVector + idx, larger);
        idx += 4u;
    }

    /* Scalar tail for the elements that did not fill a full vector. */
    while (idx < num_points) {
        const double lhs = aVector[idx];
        const double rhs = bVector[idx];

        if (lhs > rhs) {
            cVector[idx] = lhs;
        } else {
            cVector[idx] = rhs;
        }
        idx++;
    }
}
291 #endif /* LV_HAVE_AVX */
292 
293 
294 #endif /* INCLUDED_volk_64f_x2_max_64f_u_H */
FORCE_INLINE __m128d _mm_load_pd(const double *p)
Definition: sse2neon.h:4430
float32x4_t __m128d
Definition: sse2neon.h:242
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
Definition: sse2neon.h:4644
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
Definition: sse2neon.h:5897
static void volk_64f_x2_max_64f_a_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:107
static void volk_64f_x2_max_64f_u_avx(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:257
static void volk_64f_x2_max_64f_a_sse2(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:147
static void volk_64f_x2_max_64f_generic(double *cVector, const double *aVector, const double *bVector, unsigned int num_points)
Definition: volk_64f_x2_max_64f.h:186