Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
59 #define INCLUDED_volk_32f_index_max_16u_a_H
60 
61 #include <inttypes.h>
62 #include <limits.h>
63 #include <stdio.h>
64 #include <volk/volk_common.h>
65 
66 #ifdef LV_HAVE_AVX
67 #include <immintrin.h>
68 
69 static inline void
70 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
71 {
72  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
73 
74  uint32_t number = 0;
75  const uint32_t eighthPoints = num_points / 8;
76 
77  float* inputPtr = (float*)src0;
78 
79  __m256 indexIncrementValues = _mm256_set1_ps(8);
80  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
81 
82  float max = src0[0];
83  float index = 0;
84  __m256 maxValues = _mm256_set1_ps(max);
85  __m256 maxValuesIndex = _mm256_setzero_ps();
86  __m256 compareResults;
87  __m256 currentValues;
88 
89  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
90  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
91 
92  for (; number < eighthPoints; number++) {
93 
94  currentValues = _mm256_load_ps(inputPtr);
95  inputPtr += 8;
96  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
97 
98  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
99 
100  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
101  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
102  }
103 
104  // Calculate the largest value from the remaining 4 points
105  _mm256_store_ps(maxValuesBuffer, maxValues);
106  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
107 
108  for (number = 0; number < 8; number++) {
109  if (maxValuesBuffer[number] > max) {
110  index = maxIndexesBuffer[number];
111  max = maxValuesBuffer[number];
112  } else if (maxValuesBuffer[number] == max) {
113  if (index > maxIndexesBuffer[number])
114  index = maxIndexesBuffer[number];
115  }
116  }
117 
118  number = eighthPoints * 8;
119  for (; number < num_points; number++) {
120  if (src0[number] > max) {
121  index = number;
122  max = src0[number];
123  }
124  }
125  target[0] = (uint16_t)index;
126 }
127 
128 #endif /*LV_HAVE_AVX*/
129 
130 #ifdef LV_HAVE_SSE4_1
131 #include <smmintrin.h>
132 
133 static inline void
134 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
135 {
136  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
137 
138  uint32_t number = 0;
139  const uint32_t quarterPoints = num_points / 4;
140 
141  float* inputPtr = (float*)src0;
142 
143  __m128 indexIncrementValues = _mm_set1_ps(4);
144  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
145 
146  float max = src0[0];
147  float index = 0;
148  __m128 maxValues = _mm_set1_ps(max);
149  __m128 maxValuesIndex = _mm_setzero_ps();
150  __m128 compareResults;
151  __m128 currentValues;
152 
153  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
154  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
155 
156  for (; number < quarterPoints; number++) {
157 
158  currentValues = _mm_load_ps(inputPtr);
159  inputPtr += 4;
160  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
161 
162  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
163 
164  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
165  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
166  }
167 
168  // Calculate the largest value from the remaining 4 points
169  _mm_store_ps(maxValuesBuffer, maxValues);
170  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
171 
172  for (number = 0; number < 4; number++) {
173  if (maxValuesBuffer[number] > max) {
174  index = maxIndexesBuffer[number];
175  max = maxValuesBuffer[number];
176  } else if (maxValuesBuffer[number] == max) {
177  if (index > maxIndexesBuffer[number])
178  index = maxIndexesBuffer[number];
179  }
180  }
181 
182  number = quarterPoints * 4;
183  for (; number < num_points; number++) {
184  if (src0[number] > max) {
185  index = number;
186  max = src0[number];
187  }
188  }
189  target[0] = (uint16_t)index;
190 }
191 
192 #endif /*LV_HAVE_SSE4_1*/
193 
194 
195 #ifdef LV_HAVE_SSE
196 
197 #include <xmmintrin.h>
198 
199 static inline void
200 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
201 {
202  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
203 
204  uint32_t number = 0;
205  const uint32_t quarterPoints = num_points / 4;
206 
207  float* inputPtr = (float*)src0;
208 
209  __m128 indexIncrementValues = _mm_set1_ps(4);
210  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
211 
212  float max = src0[0];
213  float index = 0;
214  __m128 maxValues = _mm_set1_ps(max);
215  __m128 maxValuesIndex = _mm_setzero_ps();
216  __m128 compareResults;
217  __m128 currentValues;
218 
219  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
220  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
221 
222  for (; number < quarterPoints; number++) {
223 
224  currentValues = _mm_load_ps(inputPtr);
225  inputPtr += 4;
226  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
227 
228  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
229 
230  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
231  _mm_andnot_ps(compareResults, maxValuesIndex));
232  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
233  _mm_andnot_ps(compareResults, maxValues));
234  }
235 
236  // Calculate the largest value from the remaining 4 points
237  _mm_store_ps(maxValuesBuffer, maxValues);
238  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
239 
240  for (number = 0; number < 4; number++) {
241  if (maxValuesBuffer[number] > max) {
242  index = maxIndexesBuffer[number];
243  max = maxValuesBuffer[number];
244  } else if (maxValuesBuffer[number] == max) {
245  if (index > maxIndexesBuffer[number])
246  index = maxIndexesBuffer[number];
247  }
248  }
249 
250  number = quarterPoints * 4;
251  for (; number < num_points; number++) {
252  if (src0[number] > max) {
253  index = number;
254  max = src0[number];
255  }
256  }
257  target[0] = (uint16_t)index;
258 }
259 
260 #endif /*LV_HAVE_SSE*/
261 
262 
263 #ifdef LV_HAVE_GENERIC
264 
/*
 * Finds the index of the largest value in src0 (portable C implementation).
 *
 * target     - output; target[0] receives the index of the maximum value.
 * src0       - input floats.
 * num_points - number of elements in src0. Only the first USHRT_MAX elements
 *              are examined because the result is a 16-bit index.
 *
 * When the maximum occurs more than once, the lowest index is reported.
 * If num_points is 0, target[0] is set to 0 and src0 is never read.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        /* Nothing to scan: report index 0 without touching src0[0]. */
        target[0] = 0;
        return;
    }
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        /* Strictly greater, so the first occurrence of the maximum wins. */
        if (src0[i] > max) {
            /* i < USHRT_MAX after the clamp above, so this cannot truncate. */
            index = (uint16_t)i;
            max = src0[i];
        }
    }
    target[0] = index;
}
283 
284 #endif /*LV_HAVE_GENERIC*/
285 
286 
287 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
288 
289 
290 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
291 #define INCLUDED_volk_32f_index_max_16u_u_H
292 
293 #include <inttypes.h>
294 #include <limits.h>
295 #include <stdio.h>
296 #include <volk/volk_common.h>
297 
298 #ifdef LV_HAVE_AVX
299 #include <immintrin.h>
300 
301 static inline void
302 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
303 {
304  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
305 
306  uint32_t number = 0;
307  const uint32_t eighthPoints = num_points / 8;
308 
309  float* inputPtr = (float*)src0;
310 
311  __m256 indexIncrementValues = _mm256_set1_ps(8);
312  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
313 
314  float max = src0[0];
315  float index = 0;
316  __m256 maxValues = _mm256_set1_ps(max);
317  __m256 maxValuesIndex = _mm256_setzero_ps();
318  __m256 compareResults;
319  __m256 currentValues;
320 
321  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
322  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
323 
324  for (; number < eighthPoints; number++) {
325 
326  currentValues = _mm256_loadu_ps(inputPtr);
327  inputPtr += 8;
328  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
329 
330  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
331 
332  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
333  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
334  }
335 
336  // Calculate the largest value from the remaining 4 points
337  _mm256_storeu_ps(maxValuesBuffer, maxValues);
338  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
339 
340  for (number = 0; number < 8; number++) {
341  if (maxValuesBuffer[number] > max) {
342  index = maxIndexesBuffer[number];
343  max = maxValuesBuffer[number];
344  } else if (maxValuesBuffer[number] == max) {
345  if (index > maxIndexesBuffer[number])
346  index = maxIndexesBuffer[number];
347  }
348  }
349 
350  number = eighthPoints * 8;
351  for (; number < num_points; number++) {
352  if (src0[number] > max) {
353  index = number;
354  max = src0[number];
355  }
356  }
357  target[0] = (uint16_t)index;
358 }
359 
360 #endif /*LV_HAVE_AVX*/
361 
362 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
float32x4_t __m128
Definition: sse2neon.h:235
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
Definition: sse2neon.h:2429
FORCE_INLINE __m128 _mm_set1_ps(float _w)
Definition: sse2neon.h:2503
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1154
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1079
FORCE_INLINE __m128 _mm_setzero_ps(void)
Definition: sse2neon.h:2531
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1064
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
Definition: sse2neon.h:7458
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
Definition: sse2neon.h:1039
FORCE_INLINE __m128 _mm_load_ps(const float *p)
Definition: sse2neon.h:1858
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
Definition: sse2neon.h:2704
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128)
Definition: sse2neon.h:2237
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:302
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:70
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:266
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:200
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:65
for i
Definition: volk_config_fixed.tmpl.h:13