Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_power_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
59 #define INCLUDED_volk_32f_s32f_power_32f_a_H
60 
61 #include <inttypes.h>
62 #include <math.h>
63 #include <stdio.h>
64 
65 #ifdef LV_HAVE_SSE4_1
66 #include <tmmintrin.h>
67 
68 #ifdef LV_HAVE_LIB_SIMDMATH
69 #include <simdmath.h>
70 #endif /* LV_HAVE_LIB_SIMDMATH */
71 
72 static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
73  const float* aVector,
74  const float power,
75  unsigned int num_points)
76 {
77  unsigned int number = 0;
78 
79  float* cPtr = cVector;
80  const float* aPtr = aVector;
81 
82 #ifdef LV_HAVE_LIB_SIMDMATH
83  const unsigned int quarterPoints = num_points / 4;
84  __m128 vPower = _mm_set_ps1(power);
85  __m128 zeroValue = _mm_setzero_ps();
86  __m128 signMask;
87  __m128 negatedValues;
88  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
89  __m128 onesMask = _mm_set_ps1(1);
90 
91  __m128 aVal, cVal;
92  for (; number < quarterPoints; number++) {
93 
94  aVal = _mm_load_ps(aPtr);
95  signMask = _mm_cmplt_ps(aVal, zeroValue);
96  negatedValues = _mm_sub_ps(zeroValue, aVal);
97  aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
98 
99  // powf4 doesn't support negative values in the base, so we mask them off and then
100  // apply the negative after
101  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
102 
103  cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
104 
105  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
106 
107  aPtr += 4;
108  cPtr += 4;
109  }
110 
111  number = quarterPoints * 4;
112 #endif /* LV_HAVE_LIB_SIMDMATH */
113 
114  for (; number < num_points; number++) {
115  *cPtr++ = powf((*aPtr++), power);
116  }
117 }
118 
119 #endif /* LV_HAVE_SSE4_1 */
120 
121 
122 #ifdef LV_HAVE_SSE
123 #include <xmmintrin.h>
124 
125 #ifdef LV_HAVE_LIB_SIMDMATH
126 #include <simdmath.h>
127 #endif /* LV_HAVE_LIB_SIMDMATH */
128 
129 static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
130  const float* aVector,
131  const float power,
132  unsigned int num_points)
133 {
134  unsigned int number = 0;
135 
136  float* cPtr = cVector;
137  const float* aPtr = aVector;
138 
139 #ifdef LV_HAVE_LIB_SIMDMATH
140  const unsigned int quarterPoints = num_points / 4;
141  __m128 vPower = _mm_set_ps1(power);
142  __m128 zeroValue = _mm_setzero_ps();
143  __m128 signMask;
144  __m128 negatedValues;
145  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
146  __m128 onesMask = _mm_set_ps1(1);
147 
148  __m128 aVal, cVal;
149  for (; number < quarterPoints; number++) {
150 
151  aVal = _mm_load_ps(aPtr);
152  signMask = _mm_cmplt_ps(aVal, zeroValue);
153  negatedValues = _mm_sub_ps(zeroValue, aVal);
154  aVal =
155  _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
156 
157  // powf4 doesn't support negative values in the base, so we mask them off and then
158  // apply the negative after
159  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
160 
161  cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
162  _mm_and_ps(signMask, negativeOneToPower)),
163  cVal);
164 
165  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
166 
167  aPtr += 4;
168  cPtr += 4;
169  }
170 
171  number = quarterPoints * 4;
172 #endif /* LV_HAVE_LIB_SIMDMATH */
173 
174  for (; number < num_points; number++) {
175  *cPtr++ = powf((*aPtr++), power);
176  }
177 }
178 
179 #endif /* LV_HAVE_SSE */
180 
181 
182 #ifdef LV_HAVE_GENERIC
183 
184 static inline void volk_32f_s32f_power_32f_generic(float* cVector,
185  const float* aVector,
186  const float power,
187  unsigned int num_points)
188 {
189  float* cPtr = cVector;
190  const float* aPtr = aVector;
191  unsigned int number = 0;
192 
193  for (number = 0; number < num_points; number++) {
194  *cPtr++ = powf((*aPtr++), power);
195  }
196 }
197 #endif /* LV_HAVE_GENERIC */
198 
199 
200 #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2834
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: sse2neon.h:2205
FORCE_INLINE __m128 _mm_set_ps1(float)
Definition: sse2neon.h:2437
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_s32f_power_32f_a_sse(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:129
static void volk_32f_s32f_power_32f_generic(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:184