Vector Optimized Library of Kernels  3.0.0
Architecture-tuned implementations of math kernels
volk_64u_popcnt.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
47 #ifndef INCLUDED_volk_64u_popcnt_a_H
48 #define INCLUDED_volk_64u_popcnt_a_H
49 
50 #include <inttypes.h>
51 #include <stdio.h>
52 
53 
54 #ifdef LV_HAVE_GENERIC
55 
56 
/*!
 * \brief Counts the set bits of a 64-bit word (portable implementation).
 *
 * \param ret   Output location; receives the number of 1 bits in \p value.
 * \param value The 64-bit word to examine.
 */
static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
{
    uint64_t total = 0;

    /* Handle the low 32-bit half first, then the high half. */
    for (unsigned int shift = 0; shift < 64; shift += 32) {
        uint32_t bits = (uint32_t)(value >> shift);

        /* Parallel bit count: build 2-bit, then 4-bit, then 8-bit partial sums. */
        bits = (bits & 0x55555555) + ((bits >> 1) & 0x55555555);
        bits = (bits & 0x33333333) + ((bits >> 2) & 0x33333333);
        bits = (bits + (bits >> 4)) & 0x0F0F0F0F;

        /* Fold the four byte sums together; the result (at most 32) fits in
         * the low six bits, so everything above them is masked away. */
        bits += bits >> 8;
        bits = (bits + (bits >> 16)) & 0x0000003F;

        total += bits;
    }

    *ret = total;
}
83 
84 #endif /*LV_HAVE_GENERIC*/
85 
86 
87 #if LV_HAVE_SSE4_2 && LV_HAVE_64
88 
89 #include <nmmintrin.h>
90 
/*!
 * \brief Counts the set bits of a 64-bit word via _mm_popcnt_u64.
 *
 * \param ret   Output location; receives the value returned by the intrinsic.
 * \param value The 64-bit word to examine.
 */
static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
{
    const uint64_t bit_count = (uint64_t)_mm_popcnt_u64(value);
    *ret = bit_count;
}
95 
96 #endif /*LV_HAVE_SSE4_2 && LV_HAVE_64*/
97 
98 
99 #if LV_HAVE_NEON
100 #include <arm_neon.h>
101 static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
102 {
103  uint8x8_t input_val, count8x8_val;
104  uint16x4_t count16x4_val;
105  uint32x2_t count32x2_val;
106  uint64x1_t count64x1_val;
107 
108  input_val = vld1_u8((unsigned char*)&value);
109  count8x8_val = vcnt_u8(input_val);
110  count16x4_val = vpaddl_u8(count8x8_val);
111  count32x2_val = vpaddl_u16(count16x4_val);
112  count64x1_val = vpaddl_u32(count32x2_val);
113  vst1_u64(ret, count64x1_val);
114 
115  //*ret = _mm_popcnt_u64(value);
116 }
117 #endif /*LV_HAVE_NEON*/
118 
119 
120 #endif /*INCLUDED_volk_64u_popcnt_a_H*/
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
Definition: sse2neon.h:8763
static void volk_64u_popcnt_neon(uint64_t *ret, const uint64_t value)
Definition: volk_64u_popcnt.h:101
static void volk_64u_popcnt_generic(uint64_t *ret, const uint64_t value)
Definition: volk_64u_popcnt.h:57