thread_pool_aspects.hpp
1 #ifndef LIBJMMCG_CORE_THREAD_POOL_TRAITS_HPP
2 #define LIBJMMCG_CORE_THREAD_POOL_TRAITS_HPP
3 
4 /******************************************************************************
5 ** Copyright © 2004 by J.M.McGuiness, coder@hussar.me.uk
6 **
7 ** This library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU Lesser General Public
9 ** License as published by the Free Software Foundation; either
10 ** version 2.1 of the License, or (at your option) any later version.
11 **
12 ** This library is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ** Lesser General Public License for more details.
16 **
17 ** You should have received a copy of the GNU Lesser General Public
18 ** License along with this library; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 #include "batch.hpp"
23 #include "intrusive.hpp"
24 #include "priority_queue.hpp"
27 
28 #include <array>
29 #include <queue>
30 
31 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd {
32 
33  namespace lock {
34 
35  template<
36  class St,
37  St UnSig,
38  St Pri
39  >
41  public:
51  using states=St;
53  static constexpr states unsignalled=UnSig;
54  static constexpr states priority=Pri;
55 
56  /**
57  To assist in allowing compile-time computation of the algorithmic order of the threading model.
58  */
60 
61  private:
63 
64  public:
65 
66  BOOST_MPL_ASSERT((std::is_same<typename write_lock_type::atomic_t, locker_type>));
67 
68  constexpr new_event_signal() noexcept(true) FORCE_INLINE
70  }
72 
73  static locker_type &__fastcall locker() noexcept(true) FORCE_INLINE {
74  static locker_type lk;
75  return lk;
76  }
77  atomic_state_type __fastcall set_nolk(states const s) noexcept(noexcept(semaphore.set())) FORCE_INLINE {
78  if (state_!=priority) {
79  state_=s;
80  }
81  return semaphore.set();
82  }
83  atomic_state_type __fastcall set(states const s) noexcept(false) FORCE_INLINE {
84  return set_nolk(s);
85  }
86  atomic_state_type __fastcall reset() noexcept(true) FORCE_INLINE {
88  if (ret==lock_traits::atom_unset) {
90  }
91  return ret;
92  }
93  void clear() noexcept(noexcept(semaphore.try_lock())) FORCE_INLINE {
96  }
97 
98  lock_result_type __fastcall lock() noexcept(noexcept(semaphore.lock())) FORCE_INLINE {
100  }
101  lock_result_type __fastcall lock(const typename lock_traits::timeout_type t) noexcept(noexcept(semaphore.lock(t))) FORCE_INLINE {
103  }
104  lock_result_type __fastcall try_lock() noexcept(noexcept(semaphore.try_lock())) FORCE_INLINE {
106  }
107  lock_result_type __fastcall unlock() noexcept(noexcept(semaphore.unlock())) FORCE_INLINE {
109  }
110  constexpr count_type __fastcall count() const noexcept(noexcept(semaphore.count())) FORCE_INLINE {
111  return semaphore.count();
112  }
113 
114  private:
115  states state_;
116  };
117 
118  }
119 
120  namespace private_ {
121 
122  template<class T>
123  struct no_op final {};
124 
125  template<template<class> class T,class V>
126  struct def_key_compare final {
127  typedef T<V> result;
128  };
129  template<class V>
131  typedef std::less<V> result;
132  };
133 
134  }
135 
 136  /// A namespace to hold various traits relating to selecting the specific specialisation of thread_pool that the user would like.
137  /**
 138  These parameters allow the user to choose a thread_pool that has the properties they would like. Note that not all combinations make sense: those that do not will give compilation errors, as they are not implemented, and of the rest some may not yet be implemented. Contact the author if you need a specific specialisation!
139  */
140  namespace pool_traits {
141 
142  namespace private_ {
143 
144  template<class QM, class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
145  struct thread_pool_queue_details_back_batch_queue;
146  template<class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
149 
150  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
151  /**
152  This is modelling the functionality of "WaitForMultipleObjects(...)" in the Win32 API.
153  */
155  EvSts,
158  CST
159  >;
160  using lock_all_type=typename lock::any_order::all<
165  >;
166  using element_type=typename Cont::value_type;
167  using value_ret_type=std::array<element_type, GSSk>; ///< Return a container of items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
168  /// The signalled_work_queue_type is an adapted list to add some thread-safety, because we need to perform multiple operations on it, atomically.
169  /**
170  Basically this has been defined as a FIFO or LIFO queue that has locking semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
171 
172  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
173 
 174  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
175  This specifically implies that adding and removing an item takes constant time.
176 
177  This queue would simply implement a bakers' scheduling algorithm: i.e. the active pool_thread would remove k items from the front of the queue to be mutated by that thread.
178 
 179  1. The items in the queue would be a list of work items, i.e. V should be a container.
 180  2. When an item is added to the queue, it would be appended to the last item's queue, leaving some distance between the last & first item, e.g. the number of threads in the pool, to try to load-balance the work-load.
 181  3. The pool_threads would loop through the list of items that they extract to perform the mutations.
 183  4. The pool_threads would remove at most GSSk work-items from the queue and loop through the list of items that each extracted to perform those mutations, as sketched below.
184 
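  The following is a minimal, illustrative sketch of the batching idea described above. It is not this library's implementation (which uses the intrusive containers and lock types selected by these traits); the names gss_k_queue and work_item are invented for the example:
  \code
  #include <array>
  #include <deque>
  #include <functional>
  #include <mutex>

  template<unsigned long k>
  class gss_k_queue {
  public:
   using work_item=std::function<void()>;

   void push_back(work_item w) {
    std::lock_guard<std::mutex> lk(mtx_);
    items_.push_back(std::move(w));
   }
   // Remove at most k items under a single lock acquisition; the returned batch is then
   // processed thread-locally, so contention on the shared queue is reduced.
   std::array<work_item, k> pop_batch() {
    std::array<work_item, k> batch{};
    std::lock_guard<std::mutex> lk(mtx_);
    for (unsigned long i=0; i<k && !items_.empty(); ++i) {
     batch[i]=std::move(items_.front());
     items_.pop_front();
    }
    return batch;	// A pool_thread then runs each non-empty element without further locking.
   }

  private:
   std::mutex mtx_;
   std::deque<work_item> items_;
  };
  \endcode
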
185  \see back_batch
186  \see queue, funky_queue
187  \see intrusive::slist
188 
189  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
190  */
192 // TODO testing stability & performance: funky_queue<
193  queue<
194  Cont,
196  typename exit_requested_type::locker_type::write_lock_type, ///< See: EREW locking used in the signalled_work_queue_type.
200  >, ///< Also try funky_queue as an alternative queue implementation.
201  GSSk
202  >;
205 
206  /**
207  To assist in allowing compile-time computation of the algorithmic order of the threading model.
208  */
210 
211  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
213  /**
214  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
215  */
217 
220  }
221  };
222  template<pool_traits::work_distribution_mode_t::queue_model_t::stealing_mode_t SM, class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
225  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
226  /**
227  Because we spin in the thread we can just use a boolean flag to signal exiting.
228  */
230  EvSts,
233  CST
234  >;
235  using lock_all_type=typename lock::any_order::all<
240  >;
241  using element_type=typename Cont::value_type;
242 
243  /// GSS(k) batching is not supported.
244  BOOST_MPL_ASSERT((std::is_same<std::integral_constant<unsigned long, GSSk>, std::integral_constant<unsigned long, 1UL>>));
245 
 246  using value_ret_type=std::array<element_type, GSSk>; ///< Return one item from the front of the queue; the GSS(k) or bakers' scheduling algorithm is not supported.
247  /// The signalled_work_queue_type is a lock-free list, because we need to perform multiple operations on it, atomically.
248  /**
 249  Basically this has been defined as a LIFO or FIFO queue that has lock-free semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
250 
251  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
252 
 253  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
254  This specifically implies that adding and removing an item takes constant time.
255 
256  \see intrusive::slist
257 
258  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
259  */
262  using have_work_type=no_signalling<exit_requested_type>; ///< We're going to busy-wait for new input_work in the pool_threads.
263 
264  /**
265  To assist in allowing compile-time computation of the algorithmic order of the threading model.
266  */
268 
269  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
271 
273  : exit_requested_() {
274  }
275  };
276 
277  template<class QM, class EvSts, template<class> class Stats, class V, template<class> class Comp, unsigned long GSSk>
278  struct thread_pool_queue_details_front_batch_priority_queue;
279  template<class EvSts, template<class> class Stats, class V, template<class> class Comp, unsigned long GSSk>
282  /**
283  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
284  */
286  V,
287  typename V::base_t::lock_traits
288  >;
289  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
290  /**
291  This is modelling the functionality of "WaitForMultipleObjects(...)" in the Win32 API.
292  */
294  EvSts,
297  typename element_type::lock_traits::critical_section_type ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
298  >;
299  using lock_all_type=typename lock::any_order::all<
304  >;
306  /// The signalled_work_queue_type is an adapted list to add some thread-safety, because we need to perform multiple operations on it, atomically.
307  /**
308  Basically this has been defined as a FIFO queue that has locking semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
309 
310  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
311 
 312  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
313  This specifically implies that adding and removing an item takes constant time.
314 
315  This queue would simply implement a bakers' scheduling algorithm: i.e. the active pool_thread would remove k items from the front of the queue to be mutated by that thread.
316 
 317  1. The items in the queue would be a list of work items, i.e. V should be a container.
 318  2. When an item is added to the queue, it would be appended to the last item's queue, leaving some distance between the last & first item, e.g. the number of threads in the pool, to try to load-balance the work-load.
 319  3. The pool_threads would loop through the list of items that they extract to perform the mutations.
 321  4. The pool_threads would remove at most GSSk work-items from the queue and loop through the list of items that each extracted to perform those mutations.
322 
323  \see back_batch
324  \see queue, funky_queue
325  \see intrusive::slist
326 
327  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
328  */
333  typename exit_requested_type::locker_type::write_lock_type, ///< See: EREW locking used in the signalled_work_queue_type.
 335  std::array<element_type, GSSk> ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm; note that these items are returned in the specified priority order, but once they have been removed, adding a higher-priority item will not affect the already-removed work.
336  >,
337  GSSk
338  >;
341 
342  /**
343  To assist in allowing compile-time computation of the algorithmic order of the threading model.
344  */
346 
347  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
349  /**
350  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
351  */
353 
356  }
357  };
358 
359  template<class TPQD, unsigned long GSSk>
360  struct pool_thread_queue_details;
361  /// The pool_threads share a signalled_work_queue in the thread_pool.
362  template<template<class> class TPQD, unsigned long GSSk>
363  struct pool_thread_queue_details<TPQD<pool_traits::work_distribution_mode_t::queue_model_t::pool_owns_queue>, GSSk> {
365  using thread_pool_queue_details=TPQD<queue_model>;
366  using container_type=typename thread_pool_queue_details::container_type;
367  using os_traits=typename container_type::value_type::value_type::os_traits;
368  using statistics_type=typename thread_pool_queue_details::statistics_type;
369  using batch_type=ppd::private_::batch_details<GSSk, container_type, statistics_type>;
 370  /// A resource-efficient event: suspends the waiting thread, and wakes it when the input_work has been processed.
371  using work_complete_t=typename os_traits::lock_traits::anon_event_type;
372  using exit_requested_type=typename thread_pool_queue_details::exit_requested_type;
373  using have_work_type=typename thread_pool_queue_details::have_work_type;
374 
375  /**
376  To assist in allowing compile-time computation of the algorithmic order of the threading model.
377  */
378  static constexpr generic_traits::memory_access_modes memory_access_mode=container_type::memory_access_mode;
379 
380  /**
381  This is the batch that only this thread will process, so it does not need to be thread-safe.
382  */
383  batch_type batch;
384  container_type &signalled_work_queue;
385 
386  explicit pool_thread_queue_details(container_type &q) noexcept(true) FORCE_INLINE
387  : signalled_work_queue(q) {
388  }
389 
390  statistics_type const &statistics() const noexcept(true) FORCE_INLINE {
391  return batch.statistics();
392  }
393  statistics_type &statistics() noexcept(true) FORCE_INLINE {
394  return batch.statistics();
395  }
396  };
397  /// The pool_threads own a signalled_work_queue each, which must be thread-safe, possibly lock-free.
398  /**
399  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
 400  This idea originated in discussions with Colin Egan: this could be instantiated with a lock-free LIFO queue (to keep the cache hot) per pool_thread, allowing the pool_threads to steal from one another's queues, but only taking from the tail, not the head, so that stolen work should be cold in the cache of the associated pool_thread. Priorities are not supported.
 401  According to [1], having a queue per pool_thread with work-stealing between those queues may work better for finer-grained input_work, as long as identifying the queue with the most work is efficient. A sketch of this stealing discipline is given below.
402 
403  [1] A. Bhattacharjee et al in "Parallelization Libraries: Characterizing and Reducing Overheads" DOI 10.1145/1952998.1953003
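  A minimal sketch of the stealing discipline described above follows. It uses std::deque and std::mutex purely for illustration, whereas this header selects lock-free containers; the type worker_queue is invented for the example:
  \code
  #include <deque>
  #include <functional>
  #include <mutex>
  #include <optional>

  struct worker_queue {
   using work_item=std::function<void()>;

   void push(work_item w) {
    std::lock_guard<std::mutex> lk(mtx_);
    items_.push_front(std::move(w));
   }
   // The owning pool_thread takes from the head: the most recently pushed work, hottest in its cache.
   std::optional<work_item> pop_owner() {
    std::lock_guard<std::mutex> lk(mtx_);
    if (items_.empty()) return std::nullopt;
    work_item w=std::move(items_.front());
    items_.pop_front();
    return w;
   }
   // A thief takes from the tail: the oldest work, most likely to be cold in the owner's cache.
   std::optional<work_item> steal() {
    std::lock_guard<std::mutex> lk(mtx_);
    if (items_.empty()) return std::nullopt;
    work_item w=std::move(items_.back());
    items_.pop_back();
    return w;
   }

  private:
   std::mutex mtx_;
   std::deque<work_item> items_;
  };
  \endcode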
404  */
405  template<template<class> class TPQD, pool_traits::work_distribution_mode_t::queue_model_t::stealing_mode_t SM, unsigned long GSSk>
406  struct pool_thread_queue_details<TPQD<pool_traits::work_distribution_mode_t::queue_model_t::thread_owns_queue<SM>>, GSSk> {
408  using thread_pool_queue_details=TPQD<queue_model>;
409  using container_type=typename thread_pool_queue_details::container_type;
410  using statistics_type=typename thread_pool_queue_details::statistics_type;
 411  /// GSS(k) batching is not supported: there is a queue per thread, owned by that thread, so batching is very unlikely to be of benefit and would only increase the complexity of the code.
412  BOOST_MPL_ASSERT((std::is_same<std::integral_constant<unsigned long, GSSk>, std::integral_constant<unsigned long, 1UL>>));
413  using os_traits=typename container_type::value_type::value_type::os_traits;
414  using no_ref_counting=typename container_type::value_type::no_ref_counting;
415  using cfg_type=typename container_type::value_type::value_type::cfg_type;
416 // TODO using work_complete_t=typename os_traits::lock_traits::anon_event_type;
417  using work_complete_t=typename os_traits::lock_traits::anon_spin_event_type;
418  using exit_requested_type=typename thread_pool_queue_details::exit_requested_type;
419  using have_work_type=typename thread_pool_queue_details::have_work_type;
420 
421  /**
422  To assist in allowing compile-time computation of the algorithmic order of the threading model.
423  */
424  static constexpr generic_traits::memory_access_modes memory_access_mode=container_type::memory_access_mode;
425 
426  /**
427  This is the batch that only this thread will process, so it does not need to be thread-safe.
 428  The plan is to spin on this queue, waiting for work to be added.
429  */
430  container_type batch;
431  statistics_type statistics_;
432 
434  }
435  pool_thread_queue_details(pool_thread_queue_details &&p) noexcept(true) FORCE_INLINE
436  : batch(std::move(p.batch)), statistics_(p.statistics_) {
437  }
438  explicit pool_thread_queue_details(container_type &c) noexcept(true) FORCE_INLINE
439  : batch(c) {
440  }
441 
442  statistics_type const &statistics() const noexcept(true) FORCE_INLINE {
443  return statistics_;
444  }
445  statistics_type &statistics() noexcept(true) FORCE_INLINE {
446  return statistics_;
447  }
448  };
449  }
450 
451  /// The signalled_work_queue_type within the thread_pool will obey strict FIFO semantics.
452  template<
453  class V,
454  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a fifo signalled_work_queue_type.
 455  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
456  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
457  template<class> class Stats
458  >
459  struct normal_fifo {
460  /**
461  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
462  */
463  using internal_container=intrusive::slist<
464  V,
465  typename V::base_t::lock_traits
466  >;
467  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a FIFO signalled_work_queue_type, so no ordering.
469  template<class QM>
471  QM,
472  EvSts,
473  typename internal_container::value_type::lock_traits::critical_section_type, ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
474  Stats,
476  GSSk
477  > {};
478  template<class QM>
479  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
480  };
481 
482  /// The signalled_work_queue_type within the thread_pool will obey strict LIFO semantics.
483  template<
484  class V,
485  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a lifo signalled_work_queue_type.
 486  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
487  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
488  template<class> class Stats
489  >
490  struct normal_lifo {
491  /**
492  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
493  */
494  using internal_container=intrusive::stack<
495  V,
496  typename V::base_t::lock_traits
497  >;
498  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a LIFO signalled_work_queue_type, so no ordering.
500  /// Adapt the stack to look like a list.
501  struct adaptor : public internal_container {
502  using typename internal_container::value_type;
503 
504  value_type __fastcall front() noexcept(true) FORCE_INLINE {
505  return this->top();
506  }
507  value_type const __fastcall front() const noexcept(true) FORCE_INLINE {
508  return this->top();
509  }
510  void __fastcall push_back(value_type const &v) noexcept(true) FORCE_INLINE {
511  this->push(v);
512  }
513  void __fastcall push_back(value_type &&v) noexcept(true) FORCE_INLINE {
514  this->push(std::forward<value_type>(v));
515  }
516  };
517  template<class QM>
519  QM,
520  EvSts,
521  typename internal_container::value_type::lock_traits::critical_section_type, ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
522  Stats,
523  /**
524  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
525  */
526  adaptor,
527  GSSk
528  > {};
529  template<class QM>
530  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
531  };
532 
533  /// The signalled_work_queue_type within the thread_pool will obey strict LIFO semantics.
534  template<
535  class V,
536  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a lifo signalled_work_queue_type.
 537  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
538  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
539  template<class> class Stats
540  >
542  /**
543  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
544  */
545  using internal_container=intrusive::stack<
546  V,
547  typename V::base_t::lock_traits
548  >;
549  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a LIFO signalled_work_queue_type, so no ordering.
550  using lock_traits=typename internal_container::value_type::lock_traits;
551  using os_traits=typename internal_container::value_type::value_type::os_traits;
552  using thread_traits=typename os_traits::thread_traits;
554  /// Adapt the stack to look like a list.
555  struct lockfree_to_safe_colln : public internal_container {
556  using typename internal_container::value_type;
557  using value_ret_type=internal_container;
558 
559  static constexpr unsigned long max_size=1UL;
560 
562  : internal_container() {}
564  : internal_container(std::forward<lockfree_to_safe_colln>(a)) {}
565  value_type __fastcall front() noexcept(true) FORCE_INLINE {
566  return this->top();
567  }
568  value_type const __fastcall front() const noexcept(true) FORCE_INLINE {
569  return this->top();
570  }
571  void __fastcall push_back(value_type const &v) noexcept(true) FORCE_INLINE {
572  this->push(v);
573  }
574  void __fastcall push_back(value_type &&v) noexcept(true) FORCE_INLINE {
575  this->push(std::forward<value_type>(v));
576  }
577  value_type pop_front_1_nochk_nosig() noexcept(true) FORCE_INLINE {
578  return this->pop_top_nochk();
579  }
580  };
581  template<class QM>
583  QM,
584  EvSts,
585  api_lock_traits<platform_api, sequential_mode>::critical_section_type, ///< The container is lockfree, and we'll spin for input_work.
586  Stats,
588  GSSk
589  > {};
590  template<class QM>
591  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
592  };
593 
594  /// The signalled_work_queue_type within the thread_pool will operate upon work in some user-defined partial order.
595  template<
596  class V,
 597  template<class> class Comp, ///< The comparator functor that will be used to implement the partial ordering, which implies the priority; an illustrative comparator is sketched after this struct.
 598  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
599  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
600  template<class> class Stats
601  >
604  template<class QM>
606  QM,
607  EvSts,
608  Stats,
609  V,
610  Comp,
611  GSSk
612  > {};
613  template<class QM>
614  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
615  };
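  /*
   An illustration of the kind of comparator that could be supplied as Comp to the prioritised_queue above: a strict weak ordering over the queued work. This is only a sketch: by_priority and its priority() accessor are hypothetical and not part of this library; by default std::less<value_type> is used, and the user may instead define a weak ordering directly on their input work.

    template<class V>
    struct by_priority {
     constexpr bool operator()(V const &lhs, V const &rhs) const noexcept {
      return lhs.priority()<rhs.priority();	// Hypothetical accessor on the user's work type.
     }
    };

   Such a comparator would be passed as the Comp argument of prioritised_queue (and of pool_aspects below).
  */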
616 
617  /// The states in which the signalled_work_queue_type can be.
618  enum class states {
619  unsignalled,
622  };
623  }
624 
625  /// The fundamental way to specify the type of thread_pool that is required.
626  /**
627  This class is used to encapsulate the OS-specific threading traits, atomic locks, etc for the thread_pool and other dependent classes.
628  */
629  template<
 630  generic_traits::return_data RD, ///< Whether the thread_pool should implement returning data from the mutated work, i.e. support execution_contexts, i.e. model dataflow.
631  generic_traits::api_type API, ///< The API that should be used, e.g. Win32, posix_pthreads, platform_api, etc.
632  typename Mdl, ///< The threading model, e.g. heavyweight_threads, sequential_mode, etc.
 633  template<class, template<class> class, class, unsigned long, template<class> class> class PM, ///< Allows the user to specify if the thread_pool should support prioritisation of the work in its input queue, e.g. normal_fifo or prioritised_queue.
 634  template<class> class Comp=private_::no_op, ///< The optional comparison operator used to specify the partial ordering that determines the priority of the work. Note that this may restrict the types of work that may be transferred into the thread_pool. If a prioritised_queue is chosen, then by default this will be std::less<value_type>. This parameter looks complex because of the declaration of value_type, which is done in this class, so the user does not have access to that opaque type until after this class is declared. Another, easier, technique is for the user to define a weak ordering on their input work, which closure_base_t will make use of, in which case the user may ignore this parameter.
635  unsigned long GSSkSz=1,
636  template<class> class Stats=no_statistics,
637  template<class> class CFG=no_control_flow_graph
638  >
639  struct pool_aspects final {
 640  /// Whether the thread_pool should implement returning data from the mutated work, i.e. support execution_contexts.
642  /// The k-size for the batches to implement GSS(k) scheduling, if >1, then this is effectively the baker's ticket scheduling scheme.
643  /**
 644  The size of the batch to be taken in the GSS(k) or bakers' scheduling algorithm. Note that this is what I term "front_batch"ing: the tasks are extracted in batches from the signalled_work_queue_type within the thread_pool, as opposed to being batched when added to the thread_pool. A value of zero is not allowed. Note that with an average optimizing compiler there should be no performance loss for a batch-size of one, and higher batch sizes should simply result in reduced contention on the signalled_work_queue_type within the thread_pool. A template parameter is used so that the implementation can allocate a fixed-size array of tasks on the stack, thus avoiding calls to the memory manager and reducing locking; the converse would defeat the point of GSS(k) scheduling, which is to reduce lock contention!
645 
 646  If GSSk>1 and the first closure_base-derived closure depends upon a later job to complete (with a dependency that is not managed by execution_contexts, i.e. a back-edge in the control-dependency graph, i.e. not a strictly nested dependency), then that sub-tree of dependent closure_bases will deadlock. This is because the processing loop in pool_thread::process() will wait for the first closure_base to complete, which depends upon the second (or later in the batch) closure_base, which will not be executed because the earlier closure_base prevents this loop from continuing. Therefore one must ensure that for GSSk>1 the dependency tree of the closure_bases has been carefully constructed. If all is well in sequential_mode, yet it fails with GSSk>1 using platform_api, try GSSk=1: if that cures it, this is your issue. The sketch below illustrates the hazard.
647 
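  The hazard can be reproduced outside this library with plain std::future. This is only an illustrative sketch and does not use this library's API: closure A waits for closure B, but B sits behind A in the same batch, so the single thread draining the batch can never reach it.
  \code
  #include <chrono>
  #include <functional>
  #include <future>
  #include <iostream>
  #include <vector>

  int main() {
   std::promise<void> b_done;
   std::vector<std::function<void()>> batch;	// One thread's batch, i.e. GSSk==2.
   // Closure A: a back-edge dependency upon B, not managed by an execution_context.
   batch.push_back([&b_done] {
    if (b_done.get_future().wait_for(std::chrono::seconds(1))==std::future_status::timeout)
     std::cout<<"A cannot complete: B is stuck behind it in the same batch.\n";
   });
   // Closure B: would satisfy A's dependency, but is only reached once A has returned.
   batch.push_back([&b_done] { b_done.set_value(); });
   for (auto &work : batch)	// The thread processes its batch strictly in order.
    work();
  }
  \endcode
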
648  \see batch_details::process_a_batch()
649  */
650  static constexpr unsigned long GSSk=GSSkSz;
651 
652  /// The all-important os-traits: used to obtain not only the threading model_traits and generic_traits which provide the abstraction to the underlying threading implementation in the api_threading_traits, but also the api_type, and therefore the api_lock_traits which contains the atomic locks and atomic counters used. So: rather important.
653  typedef thread_os_traits<API, Mdl> os_traits;
654  typedef CFG<os_traits> cfg_type;
655 
656  template<class V> using atomic_wrapper_t=typename os_traits::lock_traits::template atomic_counter_type<V>;
657 
658  /// Some typedefs used as short-hands.
661 
662  /// Some classes used as short-hands.
663  template<generic_traits::return_data RD1, class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
664  struct thread_wk;
665  /// Some classes used as short-hands.
666  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
669  typedef typename base_t::closure_t closure_t;
673 
675  : base_t(w, std::forward<typename closure_t::argument_type>(tw), p) {
676  }
677  __stdcall ~thread_wk() noexcept(true) FORCE_INLINE {}
678  };
679  /// Some classes used as short-hands.
680  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
683  typedef typename base_t::closure_t closure_t;
687 
689  : base_t(std::forward<typename closure_t::argument_type>(tw), p) {
690  }
691 
692  private:
693  friend deleter_t;
694 
695  __stdcall ~thread_wk() noexcept(true) FORCE_INLINE {}
696  };
697  /// Some classes used as short-hands.
698  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
699  struct algo_thread_wk final : public private_::closure::algo_thread_wk<os_traits, ThrW, WFlg, Del, AtCtr, cfg_type> {
701  typedef typename base_t::closure_t closure_t;
704 
705  algo_thread_wk(work_complete_t &w, typename closure_t::argument_type &&tw, typename cfg_details_type::params const &p) FORCE_INLINE
706  : base_t(w, std::forward<typename closure_t::argument_type>(tw), p) {
707  }
708  ~algo_thread_wk() noexcept(true) FORCE_INLINE {
709  }
710  };
711  /// Some classes used as short-hands.
712  template<class ThrW, class WFlg, class SubDivAlgWk, template<class> class Del=default_delete, template<class> class AtCtr=atomic_wrapper_t>
713  struct algo_thread_wk_buffered final : public private_::closure::algo_thread_wk_buffered<os_traits, ThrW, WFlg, SubDivAlgWk, Del, AtCtr, cfg_type> {
715  typedef typename base_t::closure_t closure_t;
719 
720  algo_thread_wk_buffered(work_complete_t &w, typename closure_t::argument_type &&tw, typename algo_work_heap_type::size_type const num_objs, typename cfg_details_type::params const &p) FORCE_INLINE
721  : base_t(w, std::forward<typename closure_t::argument_type>(tw), num_objs, p) {
722  }
724  }
725  };
726 
727  private:
728  /// Some typedefs used as short-hands.
729  typedef PM<
730  thread_wk_elem_type,
731  Comp,
732  pool_traits::states,
733  GSSk,
734  Stats
735  > queue_t;
736 
737  public:
738  /// The specific signalled_work_queue_type to be used in the thread_pool.
739  /**
740  This class should combine a container with an atomic event. The event should be set when there are items in the queue and reset when the container becomes empty. This would allow threads to atomically wait upon the container for work to be added to it.
741 
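  A minimal sketch of that combination, assuming std::condition_variable plays the role of the event; the real signalled_work_queue_type uses the container, lock and event types selected by the traits above, and the name signalled_queue_sketch is invented for the example:
  \code
  #include <condition_variable>
  #include <deque>
  #include <functional>
  #include <mutex>

  class signalled_queue_sketch {
  public:
   using work_item=std::function<void()>;

   void push_back(work_item w) {
    {
     std::lock_guard<std::mutex> lk(mtx_);
     items_.push_back(std::move(w));
    }
    event_.notify_one();	// "Set" the event: there is now work available.
   }
   work_item pop_front() {
    std::unique_lock<std::mutex> lk(mtx_);
    event_.wait(lk, [this] { return !items_.empty(); });	// Atomically wait for work to be added.
    work_item w=std::move(items_.front());
    items_.pop_front();
    return w;	// The "event" is effectively reset once items_ becomes empty.
   }

  private:
   std::mutex mtx_;
   std::condition_variable event_;
   std::deque<work_item> items_;
  };
  \endcode
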
742  \todo Colin Egan suggested that one could consider the asynchronous work transferred into this queue as a set of instructions. (The ISA being generated by the program being compiled, composed of the unique closure_base-derived closure types transferred.) One could then analyse these instructions as sets of basic-blocks, and apply analysis to those basic blocks for code-hoisting, consider trace-scheduling, etc, etc.
743  */
744  template<class QM>
746  template<class QM>
748  template<class QM>
750  template<class QM>
752  template<class QM>
754  /**
 755  Note that the parameter to Stats is not atomic, implying that performance is preferred over accuracy. This is by design: locking reduces performance, and this library has been designed to be fast, so the statistics gathered are consequently less accurate; in particular they may be under-estimates. This isn't as bad as it first appears, as most SMP architectures implement some form of cache-coherency protocol (e.g. MESI or MOESI) that can correct some of these inaccuracies.
756 
 757  A consequence of this is that 'valgrind --tool=helgrind' will report potential race-conditions if, for example, basic_statistics is used. This is not a problem: for speed, basic_statistics does not add any locking, so the race-conditions are to be expected. Please ignore those warnings, or use the no_statistics class instead.
758 
759  \see no_statistics
760  \see basic_statistics
761  */
762  template<class QM>
764 
765  /// An accessor for getting at the priority mode that the thread_pool may support.
767  };
768 
769  template<
770  class DM,
772  class P
773  >
774  class thread_pool;
775 
776 } } }
777 
778 #endif