Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_64f_multiply_64f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_64f_multiply_64f_H
59 #define INCLUDED_volk_32f_64f_multiply_64f_H
60 
61 #include <inttypes.h>
62 
63 
64 #ifdef LV_HAVE_GENERIC
65 
/*
 * Element-wise multiply of a single-precision vector by a double-precision
 * vector, producing a double-precision result (portable scalar version).
 *
 * For every i in [0, num_points): cVector[i] = (double)aVector[i] * bVector[i].
 *
 * cVector    - output buffer; num_points doubles are written
 * aVector    - float input; num_points elements are read
 * bVector    - double input; num_points elements are read
 * num_points - number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
                                                     const float* aVector,
                                                     const double* bVector,
                                                     unsigned int num_points)
{
    double* cPtr = cVector;
    const float* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        /* Widen the float operand first so the product is formed in double. */
        *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
    }
}
80 
81 #endif /* LV_HAVE_GENERIC */
82 
83 /*
84  * Unaligned versions
85  */
86 
87 
88 #ifdef LV_HAVE_AVX
89 
90 #include <immintrin.h>
91 #include <xmmintrin.h>
92 
93 static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
94  const float* aVector,
95  const double* bVector,
96  unsigned int num_points)
97 {
98  unsigned int number = 0;
99  const unsigned int eighth_points = num_points / 8;
100 
101  double* cPtr = cVector;
102  const float* aPtr = aVector;
103  const double* bPtr = bVector;
104 
105  __m256 aVal;
106  __m128 aVal1, aVal2;
107  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
108  for (; number < eighth_points; number++) {
109 
110  aVal = _mm256_loadu_ps(aPtr);
111  bVal1 = _mm256_loadu_pd(bPtr);
112  bVal2 = _mm256_loadu_pd(bPtr + 4);
113 
114  aVal1 = _mm256_extractf128_ps(aVal, 0);
115  aVal2 = _mm256_extractf128_ps(aVal, 1);
116 
117  aDbl1 = _mm256_cvtps_pd(aVal1);
118  aDbl2 = _mm256_cvtps_pd(aVal2);
119 
120  cVal1 = _mm256_mul_pd(aDbl1, bVal1);
121  cVal2 = _mm256_mul_pd(aDbl2, bVal2);
122 
123  _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
124  _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
125 
126  aPtr += 8;
127  bPtr += 8;
128  cPtr += 8;
129  }
130 
131  number = eighth_points * 8;
132  for (; number < num_points; number++) {
133  *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
134  }
135 }
136 
137 #endif /* LV_HAVE_AVX */
138 
139 
140 #ifdef LV_HAVE_AVX
141 
142 #include <immintrin.h>
143 #include <xmmintrin.h>
144 
145 static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
146  const float* aVector,
147  const double* bVector,
148  unsigned int num_points)
149 {
150  unsigned int number = 0;
151  const unsigned int eighth_points = num_points / 8;
152 
153  double* cPtr = cVector;
154  const float* aPtr = aVector;
155  const double* bPtr = bVector;
156 
157  __m256 aVal;
158  __m128 aVal1, aVal2;
159  __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
160  for (; number < eighth_points; number++) {
161 
162  aVal = _mm256_load_ps(aPtr);
163  bVal1 = _mm256_load_pd(bPtr);
164  bVal2 = _mm256_load_pd(bPtr + 4);
165 
166  aVal1 = _mm256_extractf128_ps(aVal, 0);
167  aVal2 = _mm256_extractf128_ps(aVal, 1);
168 
169  aDbl1 = _mm256_cvtps_pd(aVal1);
170  aDbl2 = _mm256_cvtps_pd(aVal2);
171 
172  cVal1 = _mm256_mul_pd(aDbl1, bVal1);
173  cVal2 = _mm256_mul_pd(aDbl2, bVal2);
174 
175  _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
176  _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
177 
178  aPtr += 8;
179  bPtr += 8;
180  cPtr += 8;
181  }
182 
183  number = eighth_points * 8;
184  for (; number < num_points; number++) {
185  *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
186  }
187 }
188 
189 #endif /* LV_HAVE_AVX */
190 
191 
192 #endif /* INCLUDED_volk_32f_64f_multiply_64f_H */
float32x4_t __m128
Definition: sse2neon.h:235
static void volk_32f_64f_multiply_64f_generic(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:66
static void volk_32f_64f_multiply_64f_u_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:93
static void volk_32f_64f_multiply_64f_a_avx(double *cVector, const float *aVector, const double *bVector, unsigned int num_points)
Definition: volk_32f_64f_multiply_64f.h:145