Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_index_min_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_index_min_16u_a_H
59 #define INCLUDED_volk_32f_index_min_16u_a_H
60 
61 #include <inttypes.h>
62 #include <limits.h>
63 #include <stdio.h>
64 #include <volk/volk_common.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 
69 static inline void
70 volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
71 {
72  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73  const uint32_t eighthPoints = num_points / 8;
74 
75  float* inputPtr = (float*)source;
76 
77  __m256 indexIncrementValues = _mm256_set1_ps(8);
78  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
79 
80  float min = source[0];
81  float index = 0;
82  __m256 minValues = _mm256_set1_ps(min);
83  __m256 minValuesIndex = _mm256_setzero_ps();
84  __m256 compareResults;
85  __m256 currentValues;
86 
87  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
88  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
89 
90  for (uint32_t number = 0; number < eighthPoints; number++) {
91 
92  currentValues = _mm256_load_ps(inputPtr);
93  inputPtr += 8;
94  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
95 
96  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
97 
98  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
99  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
100  }
101 
102  // Calculate the smallest value from the remaining 4 points
103  _mm256_store_ps(minValuesBuffer, minValues);
104  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
105 
106  for (uint32_t number = 0; number < 8; number++) {
107  if (minValuesBuffer[number] < min) {
108  index = minIndexesBuffer[number];
109  min = minValuesBuffer[number];
110  } else if (minValuesBuffer[number] == min) {
111  if (index > minIndexesBuffer[number])
112  index = minIndexesBuffer[number];
113  }
114  }
115 
116  for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
117  if (source[number] < min) {
118  index = number;
119  min = source[number];
120  }
121  }
122  target[0] = (uint16_t)index;
123 }
124 
125 #endif /*LV_HAVE_AVX*/
126 
127 #ifdef LV_HAVE_SSE4_1
128 #include <smmintrin.h>
129 
130 static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target,
131  const float* source,
132  uint32_t num_points)
133 {
134  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
135  const uint32_t quarterPoints = num_points / 4;
136 
137  float* inputPtr = (float*)source;
138 
139  __m128 indexIncrementValues = _mm_set1_ps(4);
140  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
141 
142  float min = source[0];
143  float index = 0;
144  __m128 minValues = _mm_set1_ps(min);
145  __m128 minValuesIndex = _mm_setzero_ps();
146  __m128 compareResults;
147  __m128 currentValues;
148 
149  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
150  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
151 
152  for (uint32_t number = 0; number < quarterPoints; number++) {
153 
154  currentValues = _mm_load_ps(inputPtr);
155  inputPtr += 4;
156  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
157 
158  compareResults = _mm_cmplt_ps(currentValues, minValues);
159 
160  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
161  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
162  }
163 
164  // Calculate the smallest value from the remaining 4 points
165  _mm_store_ps(minValuesBuffer, minValues);
166  _mm_store_ps(minIndexesBuffer, minValuesIndex);
167 
168  for (uint32_t number = 0; number < 4; number++) {
169  if (minValuesBuffer[number] < min) {
170  index = minIndexesBuffer[number];
171  min = minValuesBuffer[number];
172  } else if (minValuesBuffer[number] == min) {
173  if (index > minIndexesBuffer[number])
174  index = minIndexesBuffer[number];
175  }
176  }
177 
178  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
179  if (source[number] < min) {
180  index = number;
181  min = source[number];
182  }
183  }
184  target[0] = (uint16_t)index;
185 }
186 
187 #endif /*LV_HAVE_SSE4_1*/
188 
189 
190 #ifdef LV_HAVE_SSE
191 
192 #include <xmmintrin.h>
193 
194 static inline void
195 volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
196 {
197  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
198  const uint32_t quarterPoints = num_points / 4;
199 
200  float* inputPtr = (float*)source;
201 
202  __m128 indexIncrementValues = _mm_set1_ps(4);
203  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
204 
205  float min = source[0];
206  float index = 0;
207  __m128 minValues = _mm_set1_ps(min);
208  __m128 minValuesIndex = _mm_setzero_ps();
209  __m128 compareResults;
210  __m128 currentValues;
211 
212  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
213  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
214 
215  for (uint32_t number = 0; number < quarterPoints; number++) {
216 
217  currentValues = _mm_load_ps(inputPtr);
218  inputPtr += 4;
219  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
220 
221  compareResults = _mm_cmplt_ps(currentValues, minValues);
222 
223  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
224  _mm_andnot_ps(compareResults, minValuesIndex));
225  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
226  _mm_andnot_ps(compareResults, minValues));
227  }
228 
229  // Calculate the smallest value from the remaining 4 points
230  _mm_store_ps(minValuesBuffer, minValues);
231  _mm_store_ps(minIndexesBuffer, minValuesIndex);
232 
233  for (uint32_t number = 0; number < 4; number++) {
234  if (minValuesBuffer[number] < min) {
235  index = minIndexesBuffer[number];
236  min = minValuesBuffer[number];
237  } else if (minValuesBuffer[number] == min) {
238  if (index > minIndexesBuffer[number])
239  index = minIndexesBuffer[number];
240  }
241  }
242 
243  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
244  if (source[number] < min) {
245  index = number;
246  min = source[number];
247  }
248  }
249  target[0] = (uint16_t)index;
250 }
251 
252 #endif /*LV_HAVE_SSE*/
253 
254 
255 #ifdef LV_HAVE_GENERIC
256 
/*
 * Portable C kernel: writes to target[0] the index of the smallest value in
 * source.
 *
 * target     - receives the index of the minimum (a single uint16_t).
 * source     - input floats.
 * num_points - number of floats in source; values above USHRT_MAX are clamped
 *              to USHRT_MAX so the resulting index fits in 16 bits.
 *
 * When several elements share the minimum value, the lowest index is reported.
 * For num_points == 0, target[0] is set to 0 and source is never read.
 */
static inline void
volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    // Nothing to scan: report index 0 without touching source[0].
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    float min = source[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        // Strict less-than keeps the first occurrence of the minimum.
        if (source[i] < min) {
            // i < USHRT_MAX after the clamp, so the narrowing is lossless.
            index = (uint16_t)i;
            min = source[i];
        }
    }
    target[0] = index;
}
273 
274 #endif /*LV_HAVE_GENERIC*/
275 
276 
277 #endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
278 
279 
280 #ifndef INCLUDED_volk_32f_index_min_16u_u_H
281 #define INCLUDED_volk_32f_index_min_16u_u_H
282 
283 #include <inttypes.h>
284 #include <limits.h>
285 #include <stdio.h>
286 #include <volk/volk_common.h>
287 
288 #ifdef LV_HAVE_AVX
289 #include <immintrin.h>
290 
291 static inline void
292 volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
293 {
294  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
295  const uint32_t eighthPoints = num_points / 8;
296 
297  float* inputPtr = (float*)source;
298 
299  __m256 indexIncrementValues = _mm256_set1_ps(8);
300  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
301 
302  float min = source[0];
303  float index = 0;
304  __m256 minValues = _mm256_set1_ps(min);
305  __m256 minValuesIndex = _mm256_setzero_ps();
306  __m256 compareResults;
307  __m256 currentValues;
308 
309  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
310  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
311 
312  for (uint32_t number = 0; number < eighthPoints; number++) {
313 
314  currentValues = _mm256_loadu_ps(inputPtr);
315  inputPtr += 8;
316  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
317 
318  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
319 
320  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
321  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
322  }
323 
324  // Calculate the smallest value from the remaining 4 points
325  _mm256_storeu_ps(minValuesBuffer, minValues);
326  _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
327 
328  for (uint32_t number = 0; number < 8; number++) {
329  if (minValuesBuffer[number] < min) {
330  index = minIndexesBuffer[number];
331  min = minValuesBuffer[number];
332  } else if (minValuesBuffer[number] == min) {
333  if (index > minIndexesBuffer[number])
334  index = minIndexesBuffer[number];
335  }
336  }
337 
338  for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
339  if (source[number] < min) {
340  index = number;
341  min = source[number];
342  }
343  }
344  target[0] = (uint16_t)index;
345 }
346 
347 #endif /*LV_HAVE_AVX*/
348 
349 #endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1190
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_index_min_16u_a_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:70
static void volk_32f_index_min_16u_generic(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:258
static void volk_32f_index_min_16u_a_sse(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:195
static void volk_32f_index_min_16u_u_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:292
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
for i
Definition: volk_config_fixed.tmpl.h:13