libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
count_setbits.hpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Copyright © 2012 by J.M.McGuiness, coder@hussar.me.uk
3 **
4 ** This library is free software; you can redistribute it and/or
5 ** modify it under the terms of the GNU Lesser General Public
6 ** License as published by the Free Software Foundation; either
7 ** version 2.1 of the License, or (at your option) any later version.
8 **
9 ** This library is distributed in the hope that it will be useful,
10 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
11 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 ** Lesser General Public License for more details.
13 **
14 ** You should have received a copy of the GNU Lesser General Public
15 ** License along with this library; if not, write to the Free Software
16 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18 
20 #include "config.h"
21 
22 #include <boost/static_assert.hpp>
23 
24 #include <array>
25 #include <limits>
26 
27 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace mpl {
28 
29 /// Count the number of set bits in the compile-time constant, input number.
30 /**
31  Because the operator>>() is poorly defined this only works for unsigned types. This is because there may be a sign bit or two's complement representation of the negative number. Then shifting might cause the sign bit to be shifted into the number itself, possibly causing an infinite loop.
32 
33  Complexity: compile-time: O(n) where n is at most the number of bits used to represent the input type.
34  run-time: O(1)
35  Space: O(1)
36 */
37 template<unsigned long long Val>
38 struct count_setbits {
39  typedef unsigned long long element_type;
40 
41  constexpr static element_type number=Val;
42 
43  enum : element_type {
44  value=(count_setbits<(Val>>1u)>::value+(Val&1u))
45  };
46 
47  constexpr static float efficiency() noexcept(true) {
48  return 1.0f;
49  }
50 };
51 /**
52  We can exit early if the number is shifted to zero.
53 */
54 template<>
55 struct count_setbits<0u> {
56  typedef unsigned long long element_type;
57 
58  constexpr static element_type number=0;
59 
60  enum : element_type {
62  };
63 };
64 
65 }
66 
67 namespace dyn {
68 
69 namespace private_ {
70 
71 template<class T, T... args>
72 struct array_t {
73  typedef std::array<T, sizeof...(args)> container_type;
74  static const container_type value;
75 };
76 
77 template<class T, T... args>
78 const typename array_t<T, args...>::container_type array_t<T, args...>::value={{args...}};
79 
80 }
81 
82 namespace basic {
83 
84 /// Count the number of set bits in the input number.
85 /**
86  Because the operator>>() is poorly defined this only works for unsigned types. This is because there may be a sign bit or two's complement representation of the negative number. Then shifting might cause the sign bit to be shifted into the number itself, possibly causing an infinite loop.
87 
88  Complexity: run-time: O(n) where n is at most the number of bits used to represent the input type.
89  Space: O(1)
90 
91  Also have a look at: <a href="https://graphics.stanford.edu/~seander/bithacks.html"/>
92 */
93 struct count_setbits {
94  typedef unsigned long long element_type;
95 
96  /**
97  A very simple loop-based implementation, with no lookups nor unrolling.
98  */
99  static __attribute__((const)) element_type result(element_type num) noexcept(true) {
100  element_type count=0;
101  do {
102  if (LIKELY(num&1)) {
103  ++count;
104  }
105  } while (num>>=1);
106  return count;
107  }
108 
109  constexpr static float efficiency() noexcept(true) {
110  return 1.0f;
111  }
112 };
113 
114 }
115 
116 namespace builtin {
117 
118 /// Count the number of set bits in the input number.
119 /**
120  Complexity: run-time: O(1) where n is at most the number of bits used to represent the input type.
121  Space: O(1)
122 
123  Also have a look at: <a href="http://www-graphics.stanford.edu/~seander/bithacks.html"/>
124 */
126  typedef unsigned long long element_type;
127 
128  /**
129  A very simple loop-based implementation, with no lookups nor unrolling.
130  */
131  static __attribute__((const)) element_type result(element_type num) noexcept(true) {
132  return POPCOUNTLL(num);
133  }
134 
135  constexpr static float efficiency() noexcept(true) {
136  return 1.0f;
137  }
138 };
139 
140 }
141 
142 namespace lookup {
143 
144 namespace private_ {
145 
146 template<unsigned long long Val, template<unsigned long long> class Fn, class T, T... ct_bits>
147 struct gen_nums {
148  enum {
149  num_bits_set=Fn<Val>::value
150  };
151  typedef typename gen_nums<(Val-1), Fn, T, num_bits_set, ct_bits...>::type type;
152 };
153 template<template<unsigned long long> class Fn, class T, T... ct_bits>
154 struct gen_nums<0ULL, Fn, T, ct_bits...> {
155  typedef dyn::private_::array_t<T, Fn<0ULL>::value, ct_bits...> type;
156 };
157 
158 template<u_int8_t Chars>
159 struct bits_to_type;
160 template<>
161 struct bits_to_type<1> {
162  typedef u_int8_t type;
163 };
164 template<>
165 struct bits_to_type<2> {
166  typedef u_int16_t type;
167 };
168 template<>
169 struct bits_to_type<3> {
170  typedef u_int32_t type;
171 };
172 template<>
173 struct bits_to_type<4> {
174  typedef u_int32_t type;
175 };
176 template<>
177 struct bits_to_type<5> {
178  typedef u_int64_t type;
179 };
180 template<>
181 struct bits_to_type<6> {
182  typedef u_int64_t type;
183 };
184 template<>
185 struct bits_to_type<7> {
186  typedef u_int64_t type;
187 };
188 template<>
189 struct bits_to_type<8> {
190  typedef u_int64_t type;
191 };
192 
193 template<u_int8_t NumBits>
194 struct cache {
195  enum {
196  max_size=NumBits,
197  num_chars=((max_size+7)/8)
198  };
199  typedef typename bits_to_type<num_chars>::type range_type;
201  typedef typename type::container_type container_type;
202 
203  /**
204  \return If ratio of the number of bits requested in the cache and the actual number of bits required to represent that number of bits. For example if 8, 16, 32 or 64 bits are requested, then the efficiency will be 1. If the number of bits requested is 33 then the efficiency will be 33/64, i.e. lots of wasted bits will be required.
205  */
206  constexpr static float efficiency() noexcept(true) {
207  return static_cast<float>(max_size)/(num_chars*8);
208  }
209 };
210 
211 }
212 
213 /// Count the number of set bits in the input number.
214 /**
215  Because the operator>>() is poorly defined this only works for unsigned types. This is because there may be a sign bit or two's complement representation of the negative number. Then shifting might cause the sign bit to be shifted into the number itself, possibly causing an infinite loop.
216 
217  Complexity: run-time: O(n/s) where n is at most the number of bits used to represent the input type, and s is the number of bits used to represent the size of the cache.
218  Space: O(s)
219 */
220 template<
221  u_int8_t NumBits
222 >
224 private:
225  /**
226  This cache is generated at compile-time to maintain performance of the algorithm. One hopes that the cache can fit within the L1-cache, or at least be rapidly loaded into there, so avoiding anything larger than unsigned short is probably a good idea.
227  */
228  typedef private_::cache<NumBits> cache_t;
229 
230 public:
231  typedef unsigned long long element_type;
232 
233  BOOST_STATIC_ASSERT((sizeof(element_type)*8)<=std::numeric_limits<typename cache_t::range_type>::max());
234 
235  /**
236  A loop-based implementation, using a cache. Note that this implementation assumes that the % and / operations when using divisors that are powers of 2 is fast either because:
237  - The compiler is a good optimising compiler and can covert the % & / to shift operations or,
238  - the core has either at least one pipelined execution unit that can perform % and / or two execution units that can each perform either % or /.
239  */
240  static __attribute__((pure)) element_type result(element_type num) noexcept(true) {
241  element_type count=0;
242  do {
243  count+=cache_t::type::value[num%cache_t::max_size];
244  } while (num/=cache_t::max_size);
245  return count;
246  }
247 
248  constexpr static float efficiency() noexcept(true) {
249  return cache_t::efficiency();
250  }
251 };
252 
253 }
254 
255 namespace unroll {
256 
257 namespace private_ {
258 
259 template<unsigned long long Val, template<unsigned long long> class Fn, unsigned long long... bitmasks>
260 struct gen_bitmasks {
261  typedef typename gen_bitmasks<Fn<Val>::value, Fn, Fn<Val>::value, bitmasks...>::type type;
262 };
263 template<template<unsigned long long> class Fn, unsigned long long... bitmasks>
264 struct gen_bitmasks<0ULL, Fn, bitmasks...> {
265  typedef dyn::private_::array_t<unsigned long long, bitmasks...> type;
266 };
267 
268 template<unsigned long long Val>
269 struct shifter {
270  constexpr static unsigned long long value=(Val>>1);
271 };
272 template<>
273 struct shifter<0ULL> {
274  constexpr static unsigned long long value=0ULL;
275 };
276 
277 template<u_int8_t Val, class BitSet>
278 struct unroller : unroller<Val-1, BitSet> {
279  typedef unroller<Val-1, BitSet> base_t;
280  typedef u_int8_t element_type;
281 
282  template<class T>
283  constexpr static element_type result(T num) noexcept(true) {
284  return ((num & BitSet::value[Val]) ? 1 : 0 ) + base_t::result(num);
285  }
286 };
287 template<class BitSet>
288 struct unroller<0, BitSet> {
289  typedef u_int8_t element_type;
290 
291  template<class T>
292  constexpr static element_type result(T) noexcept(true) {
293  return 0;
294  }
295 };
296 
297 }
298 
299 /// Count the number of set bits in the input number.
300 /**
301  Because the operator>>() is poorly defined this only works for unsigned types. This is because there may be a sign bit or two's complement representation of the negative number. Then shifting might cause the sign bit to be shifted into the number itself, possibly causing an infinite loop.
302 
303  Complexity: run-time: O(n) where n is at most the number of bits used to represent the input type.
304 */
306  typedef unsigned long long element_type;
307 
308 private:
309  typedef private_::gen_bitmasks<1ULL<<((sizeof(element_type)*8)-1), private_::shifter>::type bitmasks;
310  typedef private_::unroller<bitmasks::value.size()-1, bitmasks> unroller_t;
311 
312 public:
313  /**
314  A fully-unrolled loop-based implementation. This assumes that the compiler is clever enough to inline all of the recursively generated functions, and then re-roll the repeated sequence into an optimal loop for the specified architecture. But also that the processor has enough bitwise-& functional units for the re-rolled loops to contain enough overlapped bitwise-& operations to make it efficient. Moreover the compiler knows the loop bounds statically, so may be able to generate branch-prediction indicators to ensure that the loop is optimally executed.
315  */
316  constexpr static element_type result(element_type num) noexcept(true) {
317  return unroller_t::result(num);
318  }
319 
320  constexpr static float efficiency() noexcept(true) {
321  return 1.0f;
322  }
323 };
324 
325 } } } }