// doxygen/html/count__setbits_8hpp_source.html

/******************************************************************************

** Copyright © 2012 by J.M.McGuiness, coder@hussar.me.uk

**

** This library is free software; you can redistribute it and/or

** modify it under the terms of the GNU Lesser General Public

** License as published by the Free Software Foundation; either

** version 2.1 of the License, or (at your option) any later version.

**

** This library is distributed in the hope that it will be useful,

** but WITHOUT ANY WARRANTY; without even the implied warranty of

** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

** Lesser General Public License for more details.

**

** You should have received a copy of the GNU Lesser General Public

** License along with this library; if not, write to the Free Software

** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

*/


#include "blatant_old_msvc_compiler_hacks.hpp"

#include "config.h"


#include <boost/static_assert.hpp>


#include <array>

#include <limits>


namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace mpl {


/// Count the number of set bits in the compile-time constant, input number.
/**
   The count is built recursively: each instantiation contributes the least-significant bit of Val and takes the rest from the instantiation for Val>>1, until the explicit specialisation for zero ends the recursion.

   The template parameter is an unsigned type so that the right shift always moves zeroes in at the top; a sign bit shifted into the number could prevent the value from ever reaching zero.

   Complexity: compile-time: O(n) where n is at most the number of bits used to represent the input type.
   run-time: O(1)
   Space: O(1)

   \tparam Val  The number in which the set bits are counted.
*/
template<unsigned long long Val>
struct count_setbits {
   typedef unsigned long long element_type;

   /// The number that was supplied as the template argument.
   constexpr static element_type number=Val;

   enum : element_type {
      /// The set bits of everything above the lowest bit, plus the lowest bit itself.
      value=(count_setbits<(Val>>1u)>::value+(Val&1u))
   };

   /**
      \return  The constant 1.0f.
   */
   constexpr static float efficiency() noexcept(true) {
      return 1.0f;
   }
};
/**
   Terminates the recursion: once the number has been shifted to zero there are no more set bits to find, so we can exit early.

   This specialisation offers the same members as the primary template, including efficiency(), so that generic code can use count_setbits<N> uniformly for every N.
*/
template<>
struct count_setbits<0u> {
   typedef unsigned long long element_type;

   /// The number that was supplied as the template argument, i.e. zero.
   constexpr static element_type number=0;

   enum : element_type {
      /// Zero has no set bits.
      value=element_type()
   };

   /**
      \return  The constant 1.0f, as for the primary template.
   */
   constexpr static float efficiency() noexcept(true) {
      return 1.0f;
   }
};


}


namespace dyn {


namespace private_ {


/// Turns a compile-time list of values into a statically-initialised std::array.
/**
   \tparam T  The type of each value, and of each element of the resulting array.
   \tparam args  The values, in the order in which they will appear in the array.
*/
template<class T, T... args>
struct array_t {
   typedef std::array<T, sizeof...(args)> container_type;

   /**
      The values of args, in order. This is constexpr (rather than merely const) so that the elements may also be read inside constant expressions, for example by constexpr functions that index the table.
   */
   static constexpr container_type value={{args...}};
};

// Out-of-class definition: needed before C++17 whenever value is odr-used (e.g. indexed at run-time); from C++17 it is redundant but still permitted.
template<class T, T... args>
constexpr typename array_t<T, args...>::container_type array_t<T, args...>::value;


}


namespace basic {


/// Count the number of set bits in the input number.
/**
   Only unsigned types are supported, because operator>>() is poorly defined for negative numbers: a sign bit or two's complement representation may be shifted into the number itself, so the value might never reach zero and the loop might never end.

   Complexity: run-time: O(n) where n is at most the number of bits used to represent the input type.
   Space: O(1)

   Also have a look at: <a href="https://graphics.stanford.edu/~seander/bithacks.html"/>
*/
struct count_setbits {
   using element_type=unsigned long long;

   /**
      A very simple loop-based implementation, with no lookups nor unrolling: test the lowest bit, shift it away, and stop as soon as nothing is left.

      \param num  The number in which the set bits are counted.
      \return  The number of bits that are set in num.
   */
   static __attribute__((const)) element_type result(element_type num) noexcept(true) {
      element_type total=0;
      // A zero input skips the loop entirely, which yields the same answer as testing its (clear) lowest bit once.
      for (; num; num>>=1) {
         // LIKELY is a project macro; presumably a branch-prediction hint.
         if (LIKELY(num&1)) {
            ++total;
         }
      }
      return total;
   }

   /**
      \return  The constant 1.0f.
   */
   constexpr static float efficiency() noexcept(true) {
      return 1.0f;
   }
};


}


namespace builtin {


/// Count the number of set bits in the input number.
/**
   The work is handed to the POPCOUNTLL macro, which is defined elsewhere in the project.

   Complexity: run-time: presumably O(1), assuming POPCOUNTLL maps to a compiler builtin or hardware instruction - confirm against the macro's definition.
   Space: O(1)

   Also have a look at: <a href="http://www-graphics.stanford.edu/~seander/bithacks.html"/>
*/
struct count_setbits {
   using element_type=unsigned long long;

   /**
      No loop, lookup nor unrolling here: the answer is whatever POPCOUNTLL reports for the number.

      \param num  The number in which the set bits are counted.
      \return  The value of POPCOUNTLL(num), converted to element_type.
   */
   static __attribute__((const)) element_type result(element_type num) noexcept(true) {
      const element_type bits_set=POPCOUNTLL(num);
      return bits_set;
   }

   /**
      \return  The constant 1.0f.
   */
   constexpr static float efficiency() noexcept(true) {
      return 1.0f;
   }
};


}


namespace lookup {


namespace private_ {


/// Generate, at compile-time, a table holding Fn<i>::value for every i from 0 up to and including Val.
/**
   Each step evaluates Fn for the current Val, places the result at the front of the values gathered so far and recurses with Val-1; consequently element i of the finished table is Fn<i>::value.

   \tparam Val  The largest index for which an element is generated.
   \tparam Fn  The compile-time function, which exposes its result as a member called value.
   \tparam T  The type of the elements in the table.
   \tparam ct_bits  The elements gathered so far, i.e. those for the indices above Val.
*/
template<unsigned long long Val, template<unsigned long long> class Fn, class T, T... ct_bits>
struct gen_nums {
   enum {
      num_bits_set=Fn<Val>::value   ///< The element for the index Val.
   };
   using type=typename gen_nums<(Val-1), Fn, T, num_bits_set, ct_bits...>::type;
};
/**
   The end of the recursion: the element for index zero goes to the front, and the complete list is turned into an array_t.
*/
template<template<unsigned long long> class Fn, class T, T... ct_bits>
struct gen_nums<0ULL, Fn, T, ct_bits...> {
   using type=dyn::private_::array_t<T, Fn<0ULL>::value, ct_bits...>;
};


/// Map a size in chars (1 to 8) onto an unsigned integer type that is at least that wide.
/**
   Only the sizes 1 to 8 are specialised; the primary template is intentionally left undefined, so any other size is a compile-time error.

   \tparam Chars  The number of chars that the type must be able to hold.
*/
template<u_int8_t Chars>
struct bits_to_type;

// One char fits exactly.
template<> struct bits_to_type<1> { using type=u_int8_t; };

// Two chars fit exactly.
template<> struct bits_to_type<2> { using type=u_int16_t; };

// Three chars are rounded up to the four-char type.
template<> struct bits_to_type<3> { using type=u_int32_t; };
template<> struct bits_to_type<4> { using type=u_int32_t; };

// Five to seven chars are rounded up to the eight-char type.
template<> struct bits_to_type<5> { using type=u_int64_t; };
template<> struct bits_to_type<6> { using type=u_int64_t; };
template<> struct bits_to_type<7> { using type=u_int64_t; };
template<> struct bits_to_type<8> { using type=u_int64_t; };


template<u_int8_t NumBits>

struct cache {

   enum {

      max_size=NumBits,

      num_chars=((max_size+7)/8)

   };

   typedef typename bits_to_type<num_chars>::type range_type;

   typedef typename gen_nums<max_size, mpl::count_setbits, range_type>::type type;

   typedef typename type::container_type container_type;


   /**

      \return  If ratio of the number of bits requested in the cache and the actual number of bits required to represent that number of bits. For example if 8, 16, 32 or 64 bits are requested, then the efficiency will be 1. If the number of bits requested is  33 then the efficiency will be 33/64, i.e. lots of wasted bits will be required.

   */

   constexpr static float efficiency() noexcept(true) {

      return static_cast<float>(max_size)/(num_chars*8);

   }

};


}


/// Count the number of set bits in the input number.
/**
   The number is consumed one group of bits at a time; the set bits in each group are read from a table that was generated at compile-time.

   Only unsigned types are supported, because shifting or dividing a negative number is poorly defined and might never reduce it to zero.

   Complexity: run-time: O(n/s) where n is at most the number of bits used to represent the input type, and s is the number of bits consumed per lookup, i.e. log2 of the radix described in result().
   Space: O(s)

   \tparam NumBits  The size requested for the cache, see private_::cache. Any value for which private_::cache can be instantiated is accepted; it need not be a power of two.
*/
template<
   u_int8_t NumBits
>
struct count_setbits {
private:
   /**
      This cache is generated at compile-time to maintain performance of the algorithm. One hopes that the cache can fit within the L1-cache, or at least be rapidly loaded into there, so avoiding anything larger than unsigned short is probably a good idea.
   */
   typedef private_::cache<NumBits> cache_t;

public:
   typedef unsigned long long element_type;

   // Every element of the cache must be able to hold the largest possible count, i.e. the number of bits in element_type.
   BOOST_STATIC_ASSERT((sizeof(element_type)*8)<=std::numeric_limits<typename cache_t::range_type>::max());

private:
   /**
      \param limit  The upper bound, which must be at least one.
      \param candidate  The power of two reached so far; callers should rely on the default.
      \return  The largest power of two that is no greater than limit.
   */
   constexpr static element_type largest_pow2_upto(element_type limit, element_type candidate=1) noexcept(true) {
      // candidate<=limit/2 is candidate*2<=limit, written so that it cannot overflow.
      return (candidate<=(limit/2)) ? largest_pow2_upto(limit, candidate*2) : candidate;
   }

public:
   /**
      A loop-based implementation, using a cache. Note that this implementation assumes that the % and / operations when using divisors that are powers of 2 is fast either because:
      - The compiler is a good optimising compiler and can convert the % & / to shift operations or,
      - the core has either at least one pipelined execution unit that can perform % and / or two execution units that can each perform either % or /.

      Splitting a number with % and / only yields groups of bits when the divisor is a power of two, and the divisor must exceed one for the loop to terminate. Therefore the divisor (radix) is the largest power of two that the cache can serve, i.e. that is no greater than its number of elements, max_size+1. For a NumBits that is a power of two, and at least two, the radix is NumBits itself.

      \param num  The number in which the set bits are counted.
      \return  The number of bits that are set in num.
   */
   static __attribute__((pure)) element_type result(element_type num) noexcept(true) {
      constexpr element_type radix=largest_pow2_upto(static_cast<element_type>(cache_t::max_size)+1);
      element_type count=0;
      do {
         // num%radix is always below radix, hence within the max_size+1 elements of the cache.
         count+=cache_t::type::value[num%radix];
      } while (num/=radix);
      return count;
   }

   /**
      \return  The efficiency reported by the cache, see private_::cache::efficiency().
   */
   constexpr static float efficiency() noexcept(true) {
      return cache_t::efficiency();
   }
};


}


namespace unroll {


namespace private_ {


/// Generate, at compile-time, the table of bitmasks that is obtained by repeatedly applying Fn to a starting value.
/**
   Each step places the current Val at the front of the masks gathered so far and recurses with Fn<Val>::value, until that value is zero. The finished table is therefore: a zero at index 0, followed by the masks from the last non-zero one produced up to the starting value itself. For example a starting value of 8 with a right-shift-by-one for Fn gives {0, 1, 2, 4, 8}.

   The starting value must be part of the table: when it is the top bit of the type, omitting it would leave that bit without a mask and so it would never be tested.

   \tparam Val  The current mask; the starting value in the outermost instantiation.
   \tparam Fn  The compile-time function that produces the next mask, exposed as a member called value; repeated application must eventually give zero.
   \tparam bitmasks  The masks gathered so far.
*/
template<unsigned long long Val, template<unsigned long long> class Fn, unsigned long long... bitmasks>
struct gen_bitmasks {
   typedef typename gen_bitmasks<Fn<Val>::value, Fn, Val, bitmasks...>::type type;
};
/**
   The end of the recursion: the zero is stored at index 0, so that the real masks occupy the indices from 1 to size()-1, which are the indices used by unroller.
*/
template<template<unsigned long long> class Fn, unsigned long long... bitmasks>
struct gen_bitmasks<0ULL, Fn, bitmasks...> {
   typedef dyn::private_::array_t<unsigned long long, 0ULL, bitmasks...> type;
};


/// A compile-time function that moves every bit of its argument one place towards the least-significant end.
/**
   \tparam Val  The number to be shifted.
*/
template<unsigned long long Val>
struct shifter {
   /// Val with its lowest bit discarded; for an unsigned number halving is the same as shifting right by one.
   constexpr static unsigned long long value=(Val/2ULL);
};
/**
   Zero stays zero.
*/
template<>
struct shifter<0ULL> {
   constexpr static unsigned long long value=0ULL;
};


/// A recursively-inherited chain that tests a number against one bitmask per level, and adds up the masks that matched.
/**
   \tparam Val  The index, within BitSet::value, of the mask tested by this level; the levels below test the indices Val-1 down to 1.
   \tparam BitSet  A type with a static member called value that can be indexed to obtain a mask.
*/
template<u_int8_t Val, class BitSet>
struct unroller : unroller<Val-1, BitSet> {
   using base_t=unroller<Val-1, BitSet>;
   using element_type=u_int8_t;

   /**
      \param num  The number to be tested.
      \return  How many of the masks at the indices 1 to Val have at least one bit in common with num.
   */
   template<class T>
   constexpr static element_type result(T num) noexcept(true) {
      // A single return statement, so that this remains a valid C++11 constexpr function.
      return base_t::result(num) + (((num & BitSet::value[Val])!=0) ? 1 : 0);
   }
};
/**
   The bottom of the chain: index zero is never tested, so it contributes nothing.
*/
template<class BitSet>
struct unroller<0, BitSet> {
   using element_type=u_int8_t;

   /**
      \return  Always zero.
   */
   template<class T>
   constexpr static element_type result(T) noexcept(true) {
      return 0;
   }
};


}


/// Count the number of set bits in the input number.
/**
   The number is tested against a table of bitmasks that is generated at compile-time, by a chain of functions that is also generated at compile-time, one function per mask.

   Only unsigned types are supported, because operator>>() is poorly defined for negative numbers: a sign bit or two's complement representation may be shifted into the number itself, so a shifted value might never reach zero.

   Complexity: run-time: O(n) where n is at most the number of bits used to represent the input type.
*/
struct count_setbits {
   using element_type=unsigned long long;

private:
   /// The most-significant bit of element_type (assuming 8-bit chars), which is where the generation of the masks starts.
   constexpr static element_type top_bit=(1ULL<<((sizeof(element_type)*8)-1));

   /// The table of masks, produced by repeatedly shifting right from top_bit.
   using bitmasks=private_::gen_bitmasks<top_bit, private_::shifter>::type;
   /// Tests the masks at the indices from the last one in the table down to 1.
   using unroller_t=private_::unroller<bitmasks::value.size()-1, bitmasks>;

public:
   /**
      A fully-unrolled loop-based implementation. It relies upon the compiler being clever enough to inline all of the recursively generated functions, and then to re-roll the repeated sequence into an optimal loop for the specified architecture. It also relies upon the processor having enough bitwise-& functional units for the re-rolled loop to overlap enough bitwise-& operations to be efficient. Moreover the loop bounds are known statically to the compiler, so it may be able to generate branch-prediction indicators to ensure that the loop is optimally executed.

      \param num  The number in which the set bits are counted.
      \return  The number of masks in the table that have a bit in common with num.
   */
   constexpr static element_type result(element_type num) noexcept(true) {
      return unroller_t::result(num);
   }

   /**
      \return  The constant 1.0f.
   */
   constexpr static float efficiency() noexcept(true) {
      return 1.0f;
   }
};


} } } }