libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
thread_dsel_types.hpp
Go to the documentation of this file.
1 #ifndef LIBJMMCG_CORE_PRIVATE_THREAD_DSEL_TYPES_HPP
2 #define LIBJMMCG_CORE_PRIVATE_THREAD_DSEL_TYPES_HPP
3 
4 /******************************************************************************
5 ** Copyright © 2010 by J.M.McGuiness, coder@hussar.me.uk
6 **
7 ** This library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU Lesser General Public
9 ** License as published by the Free Software Foundation; either
10 ** version 2.1 of the License, or (at your option) any later version.
11 **
12 ** This library is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ** Lesser General Public License for more details.
16 **
17 ** You should have received a copy of the GNU Lesser General Public
18 ** License along with this library; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 #include "../../core/config.h"
23 
24 #include <boost/bind/bind.hpp>
25 #include <numeric>
26 #include <type_traits>
27 
28 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd {
29 
30  /// A modifier that enables dividing the thread_pool into a number of equal-sized cliques of pool_threads.
31  /**
32  It applies only to parallel algorithms (a compile-time error will result if it is used with an incompatible closure_base-derived closure type).
33 
34  \see cliques_t
35  */
36  struct cliques {
37  typedef std::size_t element_type;
38 
39  /// The number of cliques into which the thread_pool should be divided.
40  /**
41  This is useful for limiting the number of tasks a parallel algorithm may automatically generate, so as to avoid swamping the thread_pool with tasks, and therefore causing excessive horizontal threading. (Horizontal threads are always dynamically generated, whereas vertical pool_threads may be precached in the thread_pool.
42  */
44 
45  explicit constexpr cliques(element_type c) noexcept(true) FORCE_INLINE
46  : number(c) {
47  }
48  };
49 
50  namespace private_ {
51 
52  /**
53  \todo The input process() had two arguments, including the cfg. This closure has only one argument to process...
54  */
 55  namespace kernel_priority {
 // NOTE(review): this listing was recovered from a rendered (HTML) source view, and the
 // extraction dropped every original line that carried a hyperlink.  Within this namespace
 // the lost lines are: 71, 112, 153, 195, 238 and 281 (the non-type template parameter
 // declaring 'Pri' in each partial specialisation -- its exact text is visible in the
 // primary template at embedded line 65); 82, 123, 165, 207, 251 and 294 (one unknown
 // declaration per class); and 106, 147, 189, 231, 275 and 318 (the statement inside each
 // process() that -- per the class documentation -- sets, via a scoped exception-safe
 // object, the kernel priority around the wrapped Wk::process() call).  Restore all of
 // these lines from version control before attempting to compile this file.
 56 
 57  /// This library-internal class allows you to temporarily set the kernel-thread priority at which the work is to be run.
 58  /**
 59  Note that the original priority is always restored, even if exceptions are thrown. No hacks here, as we haven't erased the type like in set_priority_closure.
 60 
 61  \see priority
 62  */
 63  template<
 64  class TPB,
 65  typename TPB::thread_traits::api_params_type::priority_type Pri,
 66  class Wk ///< The type of the wrapped work.
 67  >
 68  class work;
 // Partial specialisation for work whose mutator is: void (Wk::*)() const.
 69  template<
 70  class TPB,
 // [NOTE(review): original line 71 lost in extraction -- the 'Pri' non-type template parameter; see line 65 above]
 72  class Wk
 73  >
 74  class work<TPB, Pri, void (__fastcall Wk::*)() const> : public Wk {
 75  public:
 76  typedef void result_type;
 77  typedef typename TPB::thread_traits thread_traits;
 78  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 79 
 80  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 81 
 // [NOTE(review): original line 82 lost in extraction -- content unknown]
 83 
 84  /// Create the work with the appropriate priority.
 85  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 86  : Wk(wk) {
 87  }
 88 
 89  /// Return a reference to the wrapped work.
 90  /**
 91  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 92  */
 93  void get_results() const noexcept(true) FORCE_INLINE {
 94  Wk::get_results();
 95  }
 96  /// Return a reference to the wrapped work.
 97  /**
 98  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 99  */
 100  void __fastcall get_results() noexcept(true) FORCE_INLINE {
 101  Wk::get_results();
 102  }
 103 
 104  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 105  void __fastcall process() const FORCE_INLINE {
 // [NOTE(review): original line 106 lost in extraction -- presumably the scoped object that sets (and on destruction restores) the kernel priority]
 107  Wk::process();
 108  }
 109  };
 // Partial specialisation for work whose mutator is: void (Wk::*)().
 110  template<
 111  class TPB,
 // [NOTE(review): original line 112 lost in extraction -- the 'Pri' template parameter]
 113  class Wk
 114  >
 115  class work<TPB, Pri, void (__fastcall Wk::*)()> : public Wk {
 116  public:
 117  typedef void result_type;
 118  typedef typename TPB::thread_traits thread_traits;
 119  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 120 
 121  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 122 
 // [NOTE(review): original line 123 lost in extraction -- content unknown]
 124 
 125  /// Create the work with the appropriate priority.
 126  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 127  : Wk(wk) {
 128  }
 129 
 130  /// Return a reference to the wrapped work.
 131  /**
 132  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 133  */
 134  void __fastcall get_results() const noexcept(true) FORCE_INLINE {
 135  Wk::get_results();
 136  }
 137  /// Return a reference to the wrapped work.
 138  /**
 139  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 140  */
 141  void __fastcall get_results() noexcept(true) FORCE_INLINE {
 142  Wk::get_results();
 143  }
 144 
 145  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 146  void __fastcall process() FORCE_INLINE {
 // [NOTE(review): original line 147 lost in extraction -- presumably the scoped priority-setter]
 148  Wk::process();
 149  }
 150  };
 // Partial specialisation for work whose mutator is: void (Wk::*)(Res &) const.
 151  template<
 152  class TPB,
 // [NOTE(review): original line 153 lost in extraction -- the 'Pri' template parameter]
 154  class Wk,
 155  class Res
 156  >
 157  class work<TPB, Pri, void (__fastcall Wk::*)(Res &) const> : public Wk {
 158  public:
 159  typedef Res result_type;
 160  typedef typename TPB::thread_traits thread_traits;
 161  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 162 
 163  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 164 
 // [NOTE(review): original line 165 lost in extraction -- content unknown]
 166 
 167  /// Create the work with the appropriate priority.
 168  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 169  : Wk(wk) {
 170  }
 171 
 172  /// Return a reference to the wrapped work.
 173  /**
 174  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 175  */
 176  const result_type & __fastcall get_results() const noexcept(true) FORCE_INLINE {
 177  return Wk::get_results();
 178  }
 179  /// Return a reference to the wrapped work.
 180  /**
 181  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 182  */
 183  result_type & __fastcall get_results() noexcept(true) FORCE_INLINE {
 184  return Wk::get_results();
 185  }
 186 
 187  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 188  void __fastcall process(result_type &r) const FORCE_INLINE {
 // [NOTE(review): original line 189 lost in extraction -- presumably the scoped priority-setter]
 190  Wk::process(r);
 191  }
 192  };
 // Partial specialisation for work whose mutator is: void (Wk::*)(Res &).
 193  template<
 194  class TPB,
 // [NOTE(review): original line 195 lost in extraction -- the 'Pri' template parameter]
 196  class Wk,
 197  class Res
 198  >
 199  class work<TPB, Pri, void (__fastcall Wk::*)(Res &)> : public Wk {
 200  public:
 201  typedef Res result_type;
 202  typedef typename TPB::thread_traits thread_traits;
 203  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 204 
 205  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 206 
 // [NOTE(review): original line 207 lost in extraction -- content unknown]
 208 
 209  /// Create the work with the appropriate priority.
 210  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 211  : Wk(wk) {
 212  }
 213 
 214  /// Return a reference to the wrapped work.
 215  /**
 216  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 217  */
 218  const result_type & __fastcall get_results() const noexcept(true) {
 219  return Wk::get_results();
 220  }
 221  /// Return a reference to the wrapped work.
 222  /**
 223  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 224  */
 225  result_type & __fastcall get_results() noexcept(true) FORCE_INLINE {
 226  return Wk::get_results();
 227  }
 228 
 229  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 230  void __fastcall process(result_type &r) FORCE_INLINE {
 // [NOTE(review): original line 231 lost in extraction -- presumably the scoped priority-setter]
 232  Wk::process(r);
 233  }
 234  };
 235 
 // Partial specialisation for work whose mutator is: void (Wk::*)(Res &, CFGP const &) const.
 236  template<
 237  class TPB,
 // [NOTE(review): original line 238 lost in extraction -- the 'Pri' template parameter]
 239  class Wk,
 240  class Res,
 241  class CFGP
 242  >
 243  class work<TPB, Pri, void (__fastcall Wk::*)(Res &, CFGP const &) const> : public Wk {
 244  public:
 245  typedef Res result_type;
 246  typedef typename TPB::thread_traits thread_traits;
 247  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 248 
 249  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 250 
 // [NOTE(review): original line 251 lost in extraction -- content unknown]
 252 
 253  /// Create the work with the appropriate priority.
 254  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 255  : Wk(wk) {
 256  }
 257 
 258  /// Return a reference to the wrapped work.
 259  /**
 260  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 261  */
 262  const result_type & __fastcall get_results() const noexcept(true) FORCE_INLINE {
 263  return Wk::get_results();
 264  }
 265  /// Return a reference to the wrapped work.
 266  /**
 267  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 268  */
 269  result_type & __fastcall get_results() noexcept(true) FORCE_INLINE {
 270  return Wk::get_results();
 271  }
 272 
 273  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 274  void __fastcall process(result_type &r, CFGP const &cfgp) const FORCE_INLINE {
 // [NOTE(review): original line 275 lost in extraction -- presumably the scoped priority-setter]
 276  Wk::process(r, cfgp);
 277  }
 278  };
 // Partial specialisation for work whose mutator is: void (Wk::*)(Res &, CFGP const &).
 279  template<
 280  class TPB,
 // [NOTE(review): original line 281 lost in extraction -- the 'Pri' template parameter]
 282  class Wk,
 283  class Res,
 284  class CFGP
 285  >
 286  class work<TPB, Pri, void (__fastcall Wk::*)(Res &, CFGP const &)> : public Wk {
 287  public:
 288  typedef Res result_type;
 289  typedef typename TPB::thread_traits thread_traits;
 290  typedef typename thread_traits::api_params_type::priority_type priority_type; ///< A convenience typedef for the thread-trait specific priority type.
 291 
 292  static constexpr priority_type priority=Pri; ///< The priority at which the wrapped work should run.
 293 
 // [NOTE(review): original line 294 lost in extraction -- content unknown]
 295 
 296  /// Create the work with the appropriate priority.
 297  explicit __stdcall work(Wk &&wk) noexcept(noexcept(Wk(std::declval<Wk>()))) FORCE_INLINE
 298  : Wk(wk) {
 299  }
 300 
 301  /// Return a reference to the wrapped work.
 302  /**
 303  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 304  */
 305  const result_type & __fastcall get_results() const noexcept(true) FORCE_INLINE {
 306  return Wk::get_results();
 307  }
 308  /// Return a reference to the wrapped work.
 309  /**
 310  This function should really be unimplementable, because work wrapped by this class does not return a value. But I need this class to be concrete, because it is used.
 311  */
 312  result_type & __fastcall get_results() noexcept(true) FORCE_INLINE {
 313  return Wk::get_results();
 314  }
 315 
 316  /// An override of the process() function in the base-class that sets and restores the priority around the work.
 317  void __fastcall process(result_type &r, CFGP const &cfgp) FORCE_INLINE {
 // [NOTE(review): original line 318 lost in extraction -- presumably the scoped priority-setter]
 319  Wk::process(r, cfgp);
 320  }
 321  };
 322 
 323  }
324 
 325  /// The parallel algorithms inherits from this to assist with implementing the cliques_t language element in the DSEL.
 326  template<class Alg>
 327  struct parallel_algorithm final : public Alg {
 328  typedef Alg base_t;
 // NOTE(review): original line 329 was lost by the HTML->text extraction of this file.
 // It must declare operation_type (used by the first ctor below), presumably
 // "typedef typename base_t::operation_type operation_type;" -- confirm against VCS.
 330 
 331  explicit parallel_algorithm(operation_type &&op) noexcept(noexcept(base_t(op))) FORCE_INLINE
 332  : base_t(op) {}
 333  parallel_algorithm(parallel_algorithm const &a) noexcept(noexcept(base_t(std::declval<parallel_algorithm>()))) FORCE_INLINE
 334  : base_t(a) {}
 335  };
336 
337  /// A wrapper for converting a boost::bind() unspecified-object into an object suitable for transferring into a thread_pool.
338  template<class BindFn>
340  public:
341  typedef BindFn operation_type;
343 
344  /**
345  \param fn An object returned from calling boost::bind. There must be no unbound arguments in the ctor, otherwise this will fail to compile. Also the called function must return an instance of the result_type object, not void.
346  */
347  explicit __stdcall wrap_boost_bind_t(operation_type &&fn) noexcept(noexcept(operation_type(std::declval<operation_type>()))) FORCE_INLINE
348  : boost_fn(fn) {
349  }
350 
351  void __fastcall process(result_type &res) FORCE_INLINE {
352  res=boost_fn();
353  }
354 
355  template<class BindFn1> constexpr bool __fastcall FORCE_INLINE
356  operator<(BindFn1 const &) const noexcept(true) {
357  return true;
358  }
359 
360  private:
361  operation_type boost_fn;
362  };
363 
364  /// A wrapper for converting a boost::bind() unspecified-object into an object suitable for transferring into a thread_pool.
365  template<class BindFn>
367  public:
368  typedef BindFn operation_type;
370 
371  /**
372  \param fn An object returned from calling std::bind. There must be no unbound arguments in the ctor, otherwise this will fail to compile. Also the called function must return an instance of the result_type object, not void.
373  */
374  explicit __stdcall wrap_std_bind_t(operation_type &&fn) noexcept(noexcept(operation_type(std::declval<operation_type>()))) FORCE_INLINE
375  : std_fn(fn) {
376  }
377 
378  void __fastcall process(result_type &res) FORCE_INLINE {
379  res=std_fn();
380  }
381 
382  template<class BindFn1> constexpr bool __fastcall FORCE_INLINE
383  operator<(BindFn1 const &) const noexcept(true) {
384  return true;
385  }
386 
387  private:
388  operation_type std_fn;
389  };
390 
 // Forward declarations of the DSEL modifier types; their definitions follow below.
 391  /// A modifier to allow joinably transferring the work to the pool.
 392  /**
 393  If this is used to add work to a pool, and an execution_context does not capture the result, then the result is UB.
 394 
 395  \todo Clearly the above is undesirable, and it would be better if we could somehow force the compiler to emit an error if the user fails to capture the result.
 396  */
 397  template<class TPB>
 398  class joinable_t;
 399  /// A modifier to allow non-joinably transferring the work to the pool.
 400  template<class TPB>
 401  class nonjoinable_t;
 402  /// A modifier to allow setting the kernel-level priority that will be used by the thread whilst processing the work.
 403  /**
 404  \param Pri The priority at which the work should be executed. Note that this priority is only used whilst the work is being executed, and after the work is completed, or an exception is thrown, the thread resets to the default priority of the thread pool.
 405  */
 406  template<template<class> class Joinability, class TPB, typename TPB::priority_type Pri>
 407  class priority_t;
 // NOTE(review): the HTML->text extraction dropped original lines 411-413 (typedefs --
 // presumably base_t, thread_pool_type and pool_traits_type, all used below), 419 and 422
 // (the two ctor signatures), 437/446 (a defaulted template parameter 'Ret', the
 // kernel_priority::work wrapper type used in push_back), 439/448 (return-type lines) and
 // 459, 461, 464, 477, 482 (fragments of the operator<< return types).  Restore from VCS.
 408  template<class TPB, typename TPB::priority_type Pri>
 409  class priority_t<joinable_t, TPB, Pri> : private joinable_t<TPB> {
 410  public:
 // [NOTE(review): original lines 411-413 lost here]
 414  typedef typename base_t::os_traits os_traits;
 415  typedef typename base_t::priority_type priority_type;
 416 
 417  static constexpr priority_type priority=Pri;
 418 
 // [NOTE(review): original line 419 lost here -- the first ctor's signature]
 420  : base_t(p, 1, cfg_p) {
 421  }
 // [NOTE(review): original line 422 lost here -- the second ctor's signature]
 423  : base_t(p, c, cfg_p) {
 424  }
 425 
 426  /// Transfer the closure_base-derived closure to be process()ed at the specified priority, specified by the template parameter.
 427  /**
 428  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
 429 
 430  \param wk The closure_base-derived closure, created from the InpWk, to be asynchronously executed. The result_type of the closure_base-derived closure is taken from a member typedef. The default mutator function is called process(result_type &) or process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of mutator. Note that the process(result_type &) or process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it.
 431  \return An opaque execution_context_stack which is captured by "auto const &" or "auto &&", for requesting the results from the asynchronously process()'d closure_base-derived closure.
 432 
 433  \see create_direct, execution_context_stack, kernel_priority::closure
 434  */
 435  template<
 436  class InpWk,
 // [NOTE(review): original line 437 lost here -- the defaulted 'Ret' template parameter used below]
 438  >
 // [NOTE(review): original line 439 lost here -- push_back's return type]
 440  push_back(InpWk &&wk) noexcept(false) {
 441  return base_t::push_back(Ret(std::forward<InpWk>(wk)));
 442  }
 443 
 444  template<
 445  class InpWk,
 // [NOTE(review): original line 446 lost here]
 447  >
 // [NOTE(review): original line 448 lost here -- operator<<'s return type]
 449  operator<<(InpWk &&wk) noexcept(false) {
 450  return push_back(std::forward<InpWk>(wk));
 451  }
 452 
 453  template<
 454  class R,
 455  class F,
 456  class L
 457  >
 458  typename thread_pool_type::template execution_context_stack<
 // [NOTE(review): original line 459 lost here]
 460  thread_pool_type, Pri, typename thread_pool_type::template create_direct<
 // [NOTE(review): original line 461 lost here -- presumably the wrap_boost_bind_t<...> argument]
 462  >
 463  >
 // [NOTE(review): original line 464 lost here]
 465  operator<<(boost::_bi::bind_t<R, F, L> &&wk) noexcept(false) {
 466  return push_back(wrap_boost_bind_t<boost::_bi::bind_t<R, F, L> >(std::forward<boost::_bi::bind_t<R, F, L>>(wk)));
 467  }
 468 
 469  template<
 470  template<class, class, class> class B,
 471  class R,
 472  class F,
 473  class L,
 474  class Test=typename std::enable_if<std::is_bind_expression<B<R, F, L>>::value>::type
 475  >
 476  typename thread_pool_type::template execution_context_stack<
 // [NOTE(review): original line 477 lost here]
 478  thread_pool_type, Pri, typename thread_pool_type::template create_direct<
 479  wrap_std_bind_t<B<R, F, L>>
 480  >
 481  >
 // [NOTE(review): original line 482 lost here]
 483  operator<<(B<R, F, L> &&wk) noexcept(false) {
 484  return push_back(wrap_std_bind_t<B<R, F, L> >(std::forward<B<R, F, L>>(wk)));
 485  }
 486  };
 // NOTE(review): the HTML->text extraction dropped original lines 488 (the class-head --
 // presumably "class priority_t<nonjoinable_t, TPB, Pri> : private nonjoinable_t<TPB> {"),
 // 490-492 (typedefs), 498 and 501 (the ctor signatures), 518 and 520 (inside push_back --
 // line 520 is the statement that actually transfers the work via creator_t!), and 528,
 // 540, 546, 554 (return-type / deleted-declaration lines).  Restore from VCS.
 487  template<class TPB, typename TPB::priority_type Pri>
 // [NOTE(review): original line 488 lost here -- the class-head]
 489  public:
 // [NOTE(review): original lines 490-492 lost here -- typedefs, presumably base_t, thread_pool_type, pool_traits_type]
 493  typedef typename base_t::os_traits os_traits;
 494  typedef typename base_t::priority_type priority_type;
 495 
 496  static constexpr priority_type priority=Pri;
 497 
 // [NOTE(review): original line 498 lost here -- the first ctor's signature]
 499  : base_t(p, 1, cfg_p) {
 500  }
 // [NOTE(review): original line 501 lost here -- the second ctor's signature]
 502  : base_t(p, c, cfg_p) {
 503  }
 504 
 505  /// Transfer the closure_base-derived closure to be process()ed at the specified priority, specified by the template parameter.
 506  /**
 507  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
 508 
 509  \param wk The closure_base-derived closure to be asynchronously executed, that must be copy-constructible. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it. The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of create_direct.
 510  \return A reference to the pool to allow chaining.
 511 
 512  \see create_direct, thread_wk_t, kernel_priority::work
 513  */
 514  template<class InpWk>
 515  thread_pool_type & __fastcall FORCE_INLINE
 516  push_back(InpWk &&wk) noexcept(false) {
 517  typedef typename thread_pool_type::template create_direct<InpWk> creator_t;
 // [NOTE(review): original line 518 lost here]
 519 
 // [NOTE(review): original line 520 lost here -- the statement that enqueues the work and returns the pool]
 521  }
 522 
 523  template<
 524  class R,
 525  class F,
 526  class L
 527  >
 // [NOTE(review): original line 528 lost here -- operator<<'s return type]
 529  operator<<(boost::_bi::bind_t<R, F, L> &&wk) noexcept(false) {
 530  return push_back(wrap_boost_bind_t<boost::_bi::bind_t<R, F, L> >(std::forward<boost::_bi::bind_t<R, F, L>>(wk)));
 531  }
 532 
 533  template<
 534  template<class, class, class> class B,
 535  class R,
 536  class F,
 537  class L,
 538  class Test=typename std::enable_if<std::is_bind_expression<B<R, F, L>>::value>::type
 539  >
 // [NOTE(review): original line 540 lost here -- operator<<'s return type]
 541  operator<<(B<R, F, L> &&wk) noexcept(false) {
 542  return push_back(wrap_std_bind_t<B<R, F, L> >(std::forward<B<R, F, L>>(wk)));
 543  }
 544 
 545  template<class InpWk>
 // [NOTE(review): original line 546 lost here -- operator<<'s return type]
 547  operator<<(InpWk &&wk) noexcept(false) {
 548  return push_back(std::forward<InpWk>(wk));
 549  }
 550 
 551  /// We don't support priorities on closure_base-derived closure that is a parallel_algorithm.
 552  template<class Alg>
 553  thread_pool_type & __fastcall
 // [NOTE(review): original line 554 lost here -- presumably "push_back(parallel_algorithm<Alg> &&)=delete;"]
 555  /// We don't support priorities on closure_base-derived closure that is a parallel_algorithm.
 556  template<class Alg>
 557  thread_pool_type & __fastcall
 558  operator<<(parallel_algorithm<Alg> &&)=delete;
 559  };
560 
 // Primary template: only the joinable_t/nonjoinable_t specialisations below are defined.
 561  template<class Base>
 562  class cliques_t;
 563  template<class TPB>
 564  class cliques_t<joinable_t<TPB> > : private joinable_t<TPB> {
 565  public:
 566  typedef joinable_t<TPB> base_t;
 // NOTE(review): original lines 567-568 lost in the HTML->text extraction -- presumably
 // the thread_pool_type and pool_traits_type typedefs (both are used by the ctor below).
 // Restore from VCS.
 569  typedef typename base_t::os_traits os_traits;
 570  typedef cliques::element_type element_type;
 571 
 572  constexpr cliques_t() noexcept(true) FORCE_INLINE {
 573  }
 574  constexpr cliques_t(thread_pool_type &p, cliques::element_type const &c, typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const &cfg_p) noexcept(true) FORCE_INLINE
 575  : base_t(p, c, cfg_p) {
 576  }
 577 
 578  /// Transfer the closure_base-derived closure with the appropriate priority, specified by the template parameter.
 579  /**
 580  This operation requires no memory allocations, in addition to those required for the operation of the parallel algorithm.
 581 
 582  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
 583 
 584  \param wk The closure_base-derived closure to be asynchronously executed, that must be copy-constructible. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it. The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of create_direct.
 585  \return An opaque type, derived from an execution_context, that must be captured using "auto const &" or "auto &&"; for requesting the results from the asynchronously process()'d closure_base-derived closure.
 586 
 587  \see create_direct, execution_context
 588  */
 589  template<class Alg>
 590  typename Alg::execution_context FORCE_INLINE
 591  push_back(parallel_algorithm<Alg> &&wk) noexcept(false) {
 592  return base_t::push_back(std::forward<parallel_algorithm<Alg>>(wk));
 593  }
 594 
 595  template<class Alg>
 596  typename Alg::execution_context __fastcall FORCE_INLINE
 597  operator<<(parallel_algorithm<Alg> &&wk) noexcept(false) {
 598  return push_back(std::forward<parallel_algorithm<Alg>>(wk));
 599  }
 600 
 601  /// We don't support cliques on closure_base-derived closure that is not a parallel_algorithm.
 602  template<class InpWk>
 603  void __fastcall
 604  push_back(InpWk &&)=delete;
 605  /// We don't support cliques on closure_base-derived closure that is not a parallel_algorithm.
 606  template<class InpWk>
 607  void __fastcall
 608  operator<<(InpWk &&)=delete;
 609  };
 610  template<class TPB>
 611  class cliques_t<nonjoinable_t<TPB> > final : private nonjoinable_t<TPB> {
 612  public:
 613  typedef nonjoinable_t<TPB> base_t;
 // NOTE(review): original lines 614-615 lost in the HTML->text extraction -- presumably
 // the thread_pool_type and pool_traits_type typedefs (both are used by the ctor below).
 // Restore from VCS.
 616  typedef typename base_t::os_traits os_traits;
 617  typedef cliques::element_type element_type;
 618 
 619  constexpr cliques_t(thread_pool_type &p, cliques::element_type const &c, typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const &cfg_p) noexcept(true) FORCE_INLINE
 620  : base_t(p, c, cfg_p) {
 621  }
 622 
 623  /// We don't support cliques on closure_base-derived closure that is not a parallel_algorithm.
 624  template<class InpWk>
 625  thread_pool_type & __fastcall
 626  push_back(InpWk &&)=delete;
 627 
 628  /// Transfer the closure_base-derived closure with the appropriate priority, specified by the template parameter.
 629  /**
 630  This operation requires 2 memory allocations, in addition to those required for the operation of the parallel algorithm.
 631 
 632  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
 633 
 634  \param wk The closure_base-derived closure to be asynchronously executed, that must be copy-constructible. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it. The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of create_direct.
 635  \return A reference to the pool to allow chaining.
 636 
 637  \see create_direct
 638  */
 639  template<class Alg>
 640  thread_pool_type & __fastcall FORCE_INLINE
 641  push_back(parallel_algorithm<Alg> &&wk) noexcept(false) {
 642  return base_t::push_back(std::move(wk));
 643  }
 644 
 645  /// We don't support cliques on closure_base-derived closure that is not a parallel_algorithm.
 646  template<class InpWk>
 647  thread_pool_type & __fastcall
 648  operator<<(InpWk &&)=delete;
 649 
 650  template<class Alg>
 // [NOTE(review): original line 651 lost in extraction -- operator<<'s return type, presumably "thread_pool_type & __fastcall FORCE_INLINE"]
 652  operator<<(parallel_algorithm<Alg> &&wk) noexcept(false) {
 653  return push_back(std::forward<parallel_algorithm<Alg>>(wk));
 654  }
 655  };
656 
657  template<class TPB>
658  class joinable_t {
659  public:
660  typedef TPB thread_pool_type;
664 
666 
667  explicit constexpr joinable_t(const cliques::element_type c=1) noexcept(true) FORCE_INLINE
668  : pool(), num_cliques(c), cfg_parms("joinable") {
669  }
670  template<class T> constexpr FORCE_INLINE
671  joinable_t(T const * parent, typename pool_traits_type::thread_wk_elem_type::cfg_type::node_property_t::value_type const details[], const cliques::element_type c=1) noexcept(true)
672  : pool(), num_cliques(c), cfg_parms(parent, details) {
673  }
674  constexpr joinable_t(thread_pool_type &p, const cliques::element_type c, typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const &cfg_p) noexcept(true) FORCE_INLINE
675  : pool(&p), num_cliques(c), cfg_parms(pool->cfg(), cfg_p) {
676  }
677  constexpr joinable_t(joinable_t const &j, thread_pool_type &p) noexcept(true) FORCE_INLINE
678  : pool(&p), num_cliques(j.num_cliques), cfg_parms(pool->cfg(), j.cfg_parms) {
679  }
680 
681  /// Joinably transfer the closure_base-derived closure to the thread_pool.
682  /**
683  Verify that the closure_base-derived closure has not been previously transferred, if it has, throw an exception_type. If the implementation-defined result is not captured, the transferred closure_base-derived closure will not be process()ed.
684 
685  This operation requires no memory allocations, in addition to those required for the operation of the parallel algorithm.
686 
687  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
688 
689  \param wk The closure_base-derived closure, to be asynchronously executed. The result_type of the closure_base-derived closure is taken from a member typedef. The default mutator function is called process(result_type &) or process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of mutator. Note that the process(result_type &) or process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it.
690  \return An opaque type, derived from an execution_context_stack, that must be captured using "auto const &" or "auto &&"; for requesting the results from the asynchronously process()'d closure_base-derived closure.
691 
692  \see create_direct, execution_context_stack
693  */
694  template<class InpWk>
695  typename thread_pool_type::template execution_context_stack<InpWk> __fastcall FORCE_INLINE
696  push_back(InpWk &&wk) noexcept(false) {
697  typedef typename thread_pool_type::template execution_context_stack<InpWk> exec_ctx_t;
698 
699  return exec_ctx_t(*pool, cfg_parms, std::forward<InpWk>(wk));
700  }
701  /**
702  This operation requires no memory allocations, in addition to those required for the operation of the parallel algorithm.
703 
704  \return An opaque type, derived from an execution_context, that must be captured using "auto const &" or "auto &&"; for requesting the results from the asynchronously process()'d closure_base-derived closure.
705  */
706  template<class Alg>
707  typename Alg::execution_context __fastcall FORCE_INLINE
708  push_back(parallel_algorithm<Alg> &&wk) noexcept(false) {
709  return wk.process(num_cliques, cfg_parms);
710  }
711 
712  template<
713  class R,
714  class F,
715  class L
716  >
717  typename thread_pool_type::template execution_context_stack<wrap_boost_bind_t<boost::_bi::bind_t<R, F, L>>> FORCE_INLINE
718  operator<<(boost::_bi::bind_t<R, F, L> &&wk) noexcept(false) {
719  return push_back(wrap_boost_bind_t<boost::_bi::bind_t<R, F, L> >(std::forward<boost::_bi::bind_t<R, F, L>>(wk)));
720  }
721 
722  template<
723  template<class, class, class> class B,
724  class R,
725  class F,
726  class L,
727  class Test=typename std::enable_if<std::is_bind_expression<B<R, F, L>>::value>::type
728  >
729  typename thread_pool_type::template execution_context_stack<wrap_std_bind_t<B<R, F, L>>> FORCE_INLINE
730  operator<<(B<R, F, L> &&wk) noexcept(false) {
731  return push_back(wrap_std_bind_t<B<R, F, L> >(std::forward<B<R, F, L>>(wk)));
732  }
733 
734  template<class InpWk>
735  typename thread_pool_type::template execution_context_stack<InpWk> FORCE_INLINE
736  operator<<(InpWk &&wk) noexcept(false) {
737  typedef typename thread_pool_type::template execution_context_stack<InpWk> exec_ctx_t;
738 
739  return exec_ctx_t(*pool, cfg_parms, std::move(wk));
740  }
741  template<class Alg>
742  typename Alg::execution_context __fastcall FORCE_INLINE
743  operator<<(parallel_algorithm<Alg> &&wk) noexcept(false) {
744  return push_back(std::forward<parallel_algorithm<Alg>>(wk));
745  }
746 
747  /// Joinably transfer the closure_base-derived closure, which will be process()ed at the specified priority.
748  /**
749  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
750 
751  \see priority, priority_t
752  */
753  template<
754  typename thread_pool_type::priority_type Pri ///< The priority at which the work should be executed. Note that this priority is only used whilst the work is being executed, and after the work is completed, or an exception is thrown, the thread resets to the default priority of the thread pool. Note that this parameter is unused: it is the template-parameter that is used to specify the priority.
755  >
756  priority_t<joinable_t, TPB, Pri> __fastcall FORCE_INLINE
757  operator<<(typename thread_pool_type::template priority<Pri>) noexcept(true) {
758  return priority_t<joinable_t, TPB, Pri>(*pool, cfg_parms);
759  }
760 
761  /// Joinably transfer the closure_base-derived closure, using a sub-set of the pool_threads within the thread_pool.
762  /**
763  \param c The number of cliques into which the thread_pool should be divided.
764 
765  \see cliques, cliques_t
766  */
767  cliques_t<joinable_t> __fastcall FORCE_INLINE
768  operator<<(cliques &&c) noexcept(true) {
769  return cliques_t<joinable_t>(*pool, c.number, cfg_parms);
770  }
771 
private:
	thread_pool_type * const pool;	///< Non-owning pointer to the pool into which work is transferred. NOTE(review): presumably may be null until bound to a pool (cf. the nonjoinable_t default constructor) — confirm this class's constructors, which are outside this view.
	const cliques::element_type num_cliques;	///< The number of cliques into which the pool is divided for parallel algorithms.
	typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const cfg_parms;	///< Configuration parameters propagated into each transferred work item.
};
777 
778  template<class TPB>
780  public:
781  typedef TPB thread_pool_type;
785 
787 
/// Construct a transfer-object not yet bound to a pool (pool is null), with an optional clique-count.
/**
	\param c	The number of cliques into which the thread_pool should be divided; defaults to 1, i.e. the whole pool.
*/
constexpr nonjoinable_t(const cliques::element_type c=1) noexcept(true) FORCE_INLINE
: pool(), num_cliques(c), cfg_parms("nonjoinable") {
}
/// Construct an unbound transfer-object carrying node-property details seeded from a parent object.
/**
	\param parent	The parent object used to seed the configuration parameters — presumably for the DSEL's node bookkeeping; confirm against cfg_details_type::params.
	\param details	Node-property values forwarded into the configuration parameters.
	\param c	The number of cliques into which the thread_pool should be divided; defaults to 1.
*/
template<class T> constexpr FORCE_INLINE
nonjoinable_t(T const * parent, typename pool_traits_type::thread_wk_elem_type::cfg_type::node_property_t::value_type const details[], const cliques::element_type c=1) noexcept(true)
: pool(), num_cliques(c), cfg_parms(parent, details) {
}
/// Construct a transfer-object bound to the given pool, with an explicit clique-count, merging the pool's configuration with the supplied parameters.
constexpr nonjoinable_t(thread_pool_type &p, const cliques::element_type c, typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const &cfg_p) noexcept(true) FORCE_INLINE
: pool(&p), num_cliques(c), cfg_parms(pool->cfg(), cfg_p) {
}
/// Rebind an existing transfer-object to a (possibly different) pool, preserving its clique-count and merging that pool's configuration.
constexpr nonjoinable_t(nonjoinable_t const &nj, thread_pool_type &p) noexcept(true) FORCE_INLINE
: pool(&p), num_cliques(nj.num_cliques), cfg_parms(pool->cfg(), nj.cfg_parms) {
}
801 
802  /// Transfer the closure_base-derived closure into the thread_pool, non-joinably.
803  /**
804  No need to verify that the closure_base-derived closure has not been previously transferred.
805 
806  This operation requires at most 1 memory allocation.
807 
808  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
809 
810  \param wk The closure_base-derived closure to be asynchronously executed, that must be copy-constructible. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it. The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of create_direct.
811  \return A reference to the pool to allow chaining.
812 
813  \see create_direct
814  */
815  template<class InpWk>
816  thread_pool_type & __fastcall FORCE_INLINE
817  push_back(InpWk &&wk) noexcept(false) {
818  typedef typename thread_pool_type::template create_direct<InpWk> creator_t;
819  typedef typename pool_traits_type::template thread_wk<result_traits_, typename creator_t::closure_t, typename os_traits::lock_traits::anon_event_type, default_delete, os_traits::lock_traits::template atomic_counter_type> thread_wk_t;
820 
821  assert(dynamic_cast<thread_pool_type *>(pool));
822 
823  pool->add_nonjoinable_work(
824  typename pool_traits_type::template signalled_work_queue_type<typename thread_pool_type::work_distribution_mode::queue_model>::value_type(
825  new thread_wk_t(typename creator_t::closure_t::argument_type(std::forward<InpWk>(wk)), cfg_parms)
826  )
827  );
828  return *pool;
829  }
/**
	Non-joinably transfer a parallel-algorithm closure into the thread_pool.

	This operation requires at most 2 memory allocations, in addition to those required for the operation of the parallel algorithm.

	\return A reference to the pool, to allow chaining.
*/
template<class Alg>
thread_pool_type & __fastcall FORCE_INLINE
push_back(parallel_algorithm<Alg> &&wk) noexcept(false) {
	// The algorithm subdivides & enqueues its own work across the configured cliques.
	return wk.process(num_cliques, cfg_parms);
}
838 
839  template<class R, class F, class L>
841  operator<<(boost::_bi::bind_t<R, F, L> &&wk) noexcept(false) {
842  return push_back(wrap_boost_bind_t<boost::_bi::bind_t<R, F, L> >(std::forward<boost::_bi::bind_t<R, F, L>>(wk)));
843  }
844 
845  template<
846  template<class, class, class> class B,
847  class R,
848  class F,
849  class L,
850  class Test=typename std::enable_if<std::is_bind_expression<B<R, F, L>>::value>::type
851  >
853  operator<<(B<R, F, L> &&wk) noexcept(false) {
854  return push_back(wrap_std_bind_t<B<R, F, L> >(std::forward<B<R, F, L>>(wk)));
855  }
856 
857  template<class InpWk>
859  operator<<(InpWk &&wk) noexcept(false) {
860  return push_back(std::forward<InpWk>(wk));
861  }
862  template<class Alg>
864  operator<<(parallel_algorithm<Alg> &&wk) noexcept(false) {
865  return push_back(std::forward<parallel_algorithm<Alg>>(wk));
866  }
867 
868  /// Non-joinably transfer the closure_base-derived closure, which will be process()ed at the specified priority.
869  /**
870  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
871 
872  \see priority, priority_t
873  */
874  template<
875  typename thread_pool_type::priority_type Pri ///< The priority at which the work should be executed. Note that this priority is only used whilst the work is being executed, and after the work is completed, or an exception is thrown, the thread resets to the default priority of the thread pool. Note that this parameter is unused: it is the template-parameter that is used to specify the priority.
876  >
877  priority_t<nonjoinable_t, TPB, Pri> __fastcall FORCE_INLINE
878  operator<<(typename thread_pool_type::template priority<Pri>) noexcept(true) {
879  return priority_t<nonjoinable_t, TPB, Pri>(*pool, cfg_parms);
880  }
881 
882  /// Non-joinably transfer the closure_base-derived closure, using a sub-set of the pool_threads within the thread_pool.
883  /**
884  \param c The number of cliques into which the thread_pool should be divided.
885 
886  \see cliques, cliques_t
887  */
888  cliques_t<nonjoinable_t> __fastcall
889  operator<<(cliques &&c) noexcept(true) FORCE_INLINE {
890  return cliques_t<nonjoinable_t>(*pool, c.number, cfg_parms);
891  }
892 
private:
	thread_pool_type * const pool;	///< Non-owning pointer to the pool into which work is transferred; null for default/parent-constructed (unbound) transfer-objects.
	const cliques::element_type num_cliques;	///< The number of cliques into which the pool is divided for parallel algorithms.
	typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const cfg_parms;	///< Configuration parameters propagated into each transferred work item.
};
898 
899  /**
900  This class transfers the closure_base-derived closure generated by the parallel algorithms within subdivide_n_gen_wk::process() into the thread_pool, but the work is allocated within the custom memory-buffer allocated within the algo_thread_wk_buffered that encapsulates the result of the parallel algorithm.
901 
902  \see algo_thread_wk_buffered, subdivide_n_gen_wk::process()
903  */
904  template<class TPB>
905  class nonjoinable_buff_t final {
906  public:
907  typedef TPB thread_pool_type;
911  typedef unsigned char * buffer_type;
912 
914 
/// Construct a buffered transfer-object not yet bound to a pool (pool is null).
/**
	\param b A contiguous array of suitably aligned bytes of appropriate size for allocating the core_work items within it.
*/
explicit constexpr nonjoinable_buff_t(buffer_type const b) noexcept(true) FORCE_INLINE
: buffer(b), pool(), cfg_parms("nonjoinable_buff") {
}
/// Construct an unbound buffered transfer-object carrying node-property details seeded from a parent object.
/**
	\param b	A contiguous array of suitably aligned bytes, of appropriate size for placement-allocating the work items within it.
	\param parent	The parent object used to seed the configuration parameters — presumably for the DSEL's node bookkeeping; confirm against cfg_details_type::params.
	\param details	Node-property values forwarded into the configuration parameters.
*/
template<class T> constexpr FORCE_INLINE
nonjoinable_buff_t(buffer_type const b, T const * parent, typename pool_traits_type::thread_wk_elem_type::cfg_type::node_property_t::value_type const details[]) noexcept(true)
: buffer(b), pool(), cfg_parms(parent, details) {
}
/// Rebind an existing buffered transfer-object to a (possibly different) pool, sharing the same buffer and merging that pool's configuration.
constexpr nonjoinable_buff_t(nonjoinable_buff_t const &njb, thread_pool_type &p) noexcept(true) FORCE_INLINE
: buffer(njb.buffer), pool(&p), cfg_parms(pool->cfg(), njb.cfg_parms) {
}
928 
929  /// Transfer the closure_base-derived closure into the thread_pool, non-joinably.
930  /**
931  No need to verify that the closure_base-derived closure has not been previously transferred.
932 
933  This operation requires no memory allocations, in addition to those required for the operation of the parallel algorithm.
934 
935  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
936 
937  \param wk The closure_base-derived closure to be asynchronously executed, that must be copy-constructible. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it. The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct via the declaration of create_direct.
938  \return A reference to the pool to allow chaining.
939 
940  \see create_direct
941  */
942  template<class InpWk>
943  thread_pool_type & __fastcall FORCE_INLINE
944  push_back(InpWk &&wk) noexcept(false) {
945  typedef typename thread_pool_type::template create_direct<InpWk> creator_t;
946  typedef typename pool_traits_type::template thread_wk<result_traits_, typename creator_t::closure_t, typename os_traits::lock_traits::anon_event_type, placement_dtor, os_traits::lock_traits::template atomic_counter_type> thread_wk_t;
947 
948  assert(dynamic_cast<thread_pool_type *>(pool));
949  assert(std::accumulate(buffer, buffer+sizeof(thread_wk_t), 0UL)==0UL);
950  // Hurrah! See: we just saved loads of calls to global new & delete by using placement new here!
951  pool->add_nonjoinable_work(
952  typename pool_traits_type::template signalled_work_queue_type<typename thread_pool_type::work_distribution_mode::queue_model>::value_type(
953  new (buffer) thread_wk_t(typename creator_t::closure_t::argument_type(std::forward<InpWk>(wk)), cfg_parms)
954  )
955  );
956  return *pool;
957  }
958 
959  template<class InpWk>
961  operator<<(InpWk &&wk) noexcept(false) {
962  return push_back(std::forward<InpWk>(wk));
963  }
964 
private:
	buffer_type const buffer;	///< Caller-supplied, suitably aligned byte-buffer into which work items are placement-allocated; not owned by this object.
	thread_pool_type * const pool;	///< Non-owning pointer to the pool into which work is transferred; null for unbound transfer-objects.
	typename pool_traits_type::thread_wk_elem_type::cfg_details_type::params const cfg_parms;	///< Configuration parameters propagated into each transferred work item.
};
970 
971 }
972 
973 } } }
974 
975 #endif