libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
thread_client_context.hpp
1 #ifndef LIBJMMCG_CORE_PRIVATE_THREAD_CLIENT_CONTEXT_HPP
2 #define LIBJMMCG_CORE_PRIVATE_THREAD_CLIENT_CONTEXT_HPP
3 /******************************************************************************
4 ** Copyright © 2004 by J.M.McGuiness, coder@hussar.me.uk
5 **
6 ** This library is free software; you can redistribute it and/or
7 ** modify it under the terms of the GNU Lesser General Public
8 ** License as published by the Free Software Foundation; either
9 ** version 2.1 of the License, or (at your option) any later version.
10 **
11 ** This library is distributed in the hope that it will be useful,
12 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ** Lesser General Public License for more details.
15 **
16 ** You should have received a copy of the GNU Lesser General Public
17 ** License along with this library; if not, write to the Free Software
18 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20 
21 #include "../../core/dynamic_cast.hpp"
22 #include "../../core/non_allocatable.hpp"
23 #include "../../core/thread_api_traits.hpp"
25 #include "../../core/thread_wrapper.hpp"
26 #include "../../core/exception.hpp"
28 
29 #include <array>
30 #include <functional>
31 #include <memory>
32 
33 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd { namespace private_ {
34 
35  inline constexpr char gen_wk_node_str[]="distribute";
36  inline constexpr char gen_wk_node_root_str[]="distribute_root";
37  inline constexpr char algo_reduction_str[]="algo_reduction";
38 
39  /**
40  Compute the static size of the largest object allocated in the subdivide_n_gen_wk* algorithms, including itself. This is used to compute the size of the memory buffer allocated in algo_thread_wk_buffered into which the objects will be placement new'd.
41  */
42  template<class ParAlg>
44  typedef ParAlg gen_wk_t;
49  typedef typename std::conditional<(sizeof(subdiv_algo_work_t)>sizeof(alg_wrap_work_t)), subdiv_algo_work_t, alg_wrap_work_t>::type type;
50  };
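To make the idiom concrete, here is a minimal, standalone sketch of selecting the larger of two candidate work types at compile time and sizing a placement-new buffer from the result; the names work_a, work_b, largest and buffer_for are illustrative only, not library types:

    #include <cstddef>
    #include <new>
    #include <type_traits>

    struct work_a { int payload[4]; };
    struct work_b { double payload[8]; };

    // Select whichever candidate type is larger, mirroring the std::conditional idiom above.
    template<class A, class B>
    struct largest {
        typedef typename std::conditional<(sizeof(A) > sizeof(B)), A, B>::type type;
    };

    // A buffer large enough (and suitably aligned) to placement-new either candidate into.
    template<class A, class B>
    struct buffer_for {
        alignas(std::max_align_t) unsigned char store[sizeof(typename largest<A, B>::type)];
    };

    int main() {
        buffer_for<work_a, work_b> buf;
        work_b *wk = new (buf.store) work_b{};   // placement-new the larger type into the buffer
        wk->~work_b();                           // placement-new'd objects need an explicit dtor call
        return 0;
    }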
51 
52  /// The internal class that does the priority setting and restoration as RAII.
53  /**
54  \see set_priority_closure, set_priority_work
55  */
56  template<
57  class TT,
58  typename TT::api_params_type::priority_type new_priority ///< The priority at which the wrapped work should run.
59  >
60  class setter final {
61  public:
62  typedef TT thread_traits;
64 
65  void set() noexcept(true) FORCE_INLINE {
66  thread_traits::set_kernel_priority(thread, new_priority);
67  }
68  void reset() noexcept(true) FORCE_INLINE {
69  thread_traits::set_kernel_priority(thread, orig_pri);
70  }
71 
72  explicit __stdcall setter(typename thread_traits::api_params_type::handle_type thr) noexcept(true) FORCE_INLINE
73  : thread(thr), orig_pri(thread_traits::get_kernel_priority(thread)) {
74  set();
75  }
76  setter(setter const &)=delete;
77  __stdcall ~setter() noexcept(true) FORCE_INLINE {
78  reset();
79  }
80 
81  private:
82  const typename thread_traits::api_params_type::handle_type thread; ///< The identifier for the thread that should be manipulated.
83  const priority_type orig_pri; ///< The original priority of the thread, that should be restored.
84  };
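As a minimal, standalone analogue of the RAII save/apply/restore pattern that setter implements, with a plain integer standing in for the thread-traits kernel-priority API (the names fake_priority and priority_guard are hypothetical):

    #include <cassert>

    // Stand-in for the kernel priority of the current thread.
    static int fake_priority = 5;

    class priority_guard {
    public:
        // Save the original priority and apply the new one on construction, as setter::set() does.
        explicit priority_guard(int new_priority) noexcept
            : orig_pri(fake_priority) {
            fake_priority = new_priority;
        }
        priority_guard(priority_guard const &) = delete;
        // Restore the original priority on destruction, as setter::reset() does.
        ~priority_guard() noexcept {
            fake_priority = orig_pri;
        }

    private:
        const int orig_pri;
    };

    int main() {
        {
            priority_guard g(1);        // lower the "priority" for the scope of g
            assert(fake_priority == 1);
        }
        assert(fake_priority == 5);     // restored when g left scope
        return 0;
    }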
85 
86  /// A class to assist in processing the core_work in the GSS(k) batch, ensuring that items within a batch in a thread are fully processed before any wait in that thread is performed, because otherwise we could deadlock.
87  /**
88  Note that this class is used for the implicit batch formed by the main thread of the program that creates the thread_pool.
89 
90  \see execution_context_type, thread_pool, pool_traits::work_distribution_mode_t::worker_threads_get_work<pool_traits::work_distribution_mode_t::queue_model_t::pool_owns_queue>, pool_thread, generic_traits::return_data::joinable
91  */
92  template<
93  unsigned long GSSkSz,
94  class WQ,
95  class Stats
96  >
97  class batch_details {
98  public:
99  /// This is a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
101  /// The statistics to be gathered by the thread_pool, by default none.
102  typedef Stats statistics_type;
105  using eval_shared_del_t=eval_shared_deleter_t<typename signalled_work_queue_type::value_ret_type::value_type>;
106 
107  static constexpr unsigned long GSSk=GSSkSz;
108 
109  /**
110  To assist in allowing compile-time computation of the algorithmic order of the threading model.
111  */
112  static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=eval_shared_del_t::memory_access_mode;
113 
114  /**
115  Make sure that the batch sizes are the same, so that when a batch is popped off the signalled_work_queue, all of the elements are copied, with none being lost, which would otherwise cause closure_base-derived closures not to be processed.
116 
117  \see pool_aspect::GSSk
118  */
119  BOOST_STATIC_ASSERT(signalled_work_queue_type::max_size<=GSSk);
120 
121  /**
122  \return true if the batch is empty, false otherwise.
123  */
124  bool __fastcall batch_empty() const noexcept(true) FORCE_INLINE {
125  return current_work_item==batched_work.end() || !current_work_item->get();
126  }
127 
128  protected:
129  void __fastcall reload_batch_if_empty_nochk_nolk(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
130  if (batch_empty()) {
131  batched_work=signalled_work_queue.pop_front_nolk();
132  // We always have at least one item in the batch.
134  }
135  }
136  template<class UpdStats>
137  void FORCE_INLINE
138  process_the_work(UpdStats &&update_stats, typename cfg_type::edge_annotation_t const e_details) noexcept(false) {
139  eval_shared_del_t work(*current_work_item);
141  work.process_the_work(std::forward<UpdStats>(update_stats), e_details);
142  }
143 
144  public:
145  constexpr __stdcall batch_details() noexcept(true) FORCE_INLINE
147 
148  /// Put the closure_base-derived closure in the batch, if it is empty.
149  /**
150  Note that this function runs with no locks, as it presumes that the caller is the same pool_thread that consumes the work from the batch.
151 
152  \param wk The closure_base-derived closure to attempt to add.
153  \return true if the closure_base-derived closure was added, false otherwise.
154  */
155 #pragma GCC diagnostic push
156 // There's no sensible way to provide a simple specialisation that can remove the warning, which is due to possible use of GSS(k) batching in the queue, so we have to just ignore this warning.
157 #pragma GCC diagnostic ignored "-Wmissing-braces"
158  bool __fastcall add_work_to_batch(typename signalled_work_queue_type::value_type &&wk) noexcept(true) FORCE_INLINE {
159  if (batch_empty()) {
160  batched_work=typename signalled_work_queue_type::value_ret_type{wk};
161  // We always have at least one item in the batch.
163  return true;
164  } else {
165  return false;
166  }
167  }
168 #pragma GCC diagnostic pop
169 
170  void __fastcall refill_batch(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
171  /*
172  1. Need to re-add an event, as we consumed one to get here; otherwise we would incorrectly lead the queue to think it has one fewer work item than it really has.
173  2. Must not appear inside the lock, otherwise we can get deadlocks, because there is more work in the queue than the counter says.
174  3. Re-adding outside the lock means that another (horizontal) thread could steal this work because adding work may trigger another thread, and they race to get here. So add just before we lock to try and reduce this possibility.
175  */
176  signalled_work_queue.have_work.add();
177  const typename signalled_work_queue_type::atomic_t::write_lock_type work_queue_lk(signalled_work_queue.pop_lock(), signalled_work_queue_type::atomic_t::lock_traits::infinite_timeout());
178  if (!signalled_work_queue.colln().empty()) {
179  reload_batch_if_empty_nochk_nolk(signalled_work_queue);
180  }
181  }
182  // TODO Should process only one batch item at a time....
183  bool __fastcall process_a_batch_item() noexcept(false) FORCE_INLINE {
184  while (!batch_empty()) {
185  process_the_work(std::bind(&statistics_type::processed_hrz_work, &statistics_), cfg_type::hrz_edge_annotation);
186  }
187  return current_work_item!=batched_work.end();
188  }
189  /**
190  If the batch_size>1 and the first closure_base-derived closure depends upon a later job to complete, then that sub-tree of dependent closure_base-derived closures will deadlock. This is because this loop will wait for the first closure_base-derived closure to complete, which depends upon the second (or later in the batch) closure_base-derived closure, which will not be executed because the earlier closure_base-derived closure is preventing this loop from continuing.
191 
192  \see process_a_batch_item
193  */
194  void __fastcall process_a_batch(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
195  refill_batch(signalled_work_queue);
196  while (!batch_empty()) {
197  process_the_work(std::bind(&statistics_type::processed_vertical_work, &statistics_), cfg_type::vertical_edge_annotation);
198  }
199  }
200 
201  statistics_type const &__fastcall statistics() const noexcept(true) FORCE_INLINE {
202  return statistics_;
203  }
204  statistics_type &__fastcall statistics() noexcept(true) FORCE_INLINE {
205  return statistics_;
206  }
207 
208  protected:
212  };
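A rough, standalone sketch of the GSS(k) (bakers') batching idea implemented above: acquire the queue lock once to take up to GSSk closures into a thread-local batch, then drain that batch with no further locking. The queue contents, the value of GSSk and the helper names are illustrative only, not the library's API:

    #include <array>
    #include <deque>
    #include <functional>
    #include <iostream>
    #include <mutex>

    constexpr std::size_t GSSk = 3;   // batch size: one lock acquisition per GSSk items

    std::mutex queue_lock;
    std::deque<std::function<void()>> work_queue;   // stand-in for the shared, signalled work queue

    // Thread-local batch, refilled wholesale from the shared queue.
    std::array<std::function<void()>, GSSk> batch;
    std::size_t batch_size = 0;

    void refill_batch() {
        const std::lock_guard<std::mutex> lk(queue_lock);
        batch_size = 0;
        while (batch_size < GSSk && !work_queue.empty()) {
            batch[batch_size++] = std::move(work_queue.front());
            work_queue.pop_front();
        }
    }

    void process_a_batch() {
        refill_batch();
        // No locks held here: items within the batch are owned by this thread.
        for (std::size_t i = 0; i < batch_size; ++i) {
            batch[i]();
        }
        batch_size = 0;
    }

    int main() {
        for (int i = 0; i < 5; ++i) {
            work_queue.push_back([i] { std::cout << "work " << i << '\n'; });
        }
        process_a_batch();   // processes items 0..2 in one batch
        process_a_batch();   // processes items 3..4
        return 0;
    }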
213  /// Optimisation for when GSS(k)=GSS(1), i.e. no batching.
214  template<
215  class WQ,
216  class Stats
217  >
218  class batch_details<1UL, WQ, Stats> {
219  public:
220  /// This is a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
222  /// The statistics to be gathered by the thread_pool, by default none.
223  typedef Stats statistics_type;
227  using eval_shared_del_t=eval_shared_deleter_t<typename signalled_work_queue_type::value_type>;
228 
229  static constexpr unsigned long GSSk=1UL;
230 
231  /**
232  To assist in allowing compile-time computation of the algorithmic order of the threading model.
233  */
234  static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=eval_shared_del_t::memory_access_mode;
235 
236  /**
237  Make sure that the batch sizes are the same, so that when a batch is popped off the signalled_work_queue, all of the elements are copied, with none being lost, which would otherwise cause closure_base-derived closures not to be processed. This check is here just because I'm ultra-cautious at the moment: it is not strictly needed, but the horizontal threading test case in dataflow_full fails with GSSk>1 as it is a poor test case.
238 
239  \see pool_aspect::GSSk
240  */
241  BOOST_STATIC_ASSERT(signalled_work_queue_type::max_size<=GSSk);
242 
243  /**
244  \return true if the batch is empty, false otherwise.
245  */
246  bool __fastcall batch_empty() const noexcept(true) FORCE_INLINE {
247  return !current_work;
248  }
249 
250  protected:
251  void __fastcall reload_batch_if_empty_nochk_nosig(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
252  if (batch_empty() && !signalled_work_queue.empty()) {
253  // Make sure that when the batch is popped off the signalled_work_queue, only one element is popped, otherwise the extra would be lost, which would cause the lost closure_base-derived closure not to be processed and execution_context's never to be satisfied.
254  current_work=signalled_work_queue.pop_front_1_nochk_nosig();
255  }
256  }
257  template<class UpdStats>
258  void FORCE_INLINE
259  process_the_work(UpdStats &&update_stats, typename cfg_type::edge_annotation_t const e_details) noexcept(false) {
260  eval_shared_deleter_t<typename signalled_work_queue_type::value_ret_type::value_type> work(current_work);
261  work.process_the_work(std::forward<UpdStats>(update_stats), e_details);
262  }
263 
264  public:
265  constexpr __stdcall batch_details() noexcept(true) FORCE_INLINE
266  : statistics_(), current_work() {}
267 
268  /// Put the closure_base-derived closure in the batch, if it is empty.
269  /**
270  Note that this function runs with no locks, as it presumes that the caller is the same pool_thread that consumes the work from the batch.
271 
272  \param wk The closure_base-derived closure to attempt to add.
273  \return true if the closure_base-derived closure was added, false otherwise.
274  */
275  bool __fastcall add_work_to_batch(typename signalled_work_queue_type::value_type &&wk) noexcept(true) FORCE_INLINE {
276  if (batch_empty()) {
277  current_work=wk;
278  return true;
279  } else {
280  return false;
281  }
282  }
283 
284  void __fastcall refill_batch(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
285  reload_batch_if_empty_nochk_nosig(signalled_work_queue);
286  }
287  bool __fastcall process_a_batch_item() noexcept(false) FORCE_INLINE {
288  while (!batch_empty()) {
289  process_the_work(std::bind(&statistics_type::processed_hrz_work, &statistics_), cfg_type::hrz_edge_annotation);
290  }
291  return false;
292  }
293  void __fastcall process_a_batch(signalled_work_queue_type &signalled_work_queue) noexcept(false) FORCE_INLINE {
294  refill_batch(signalled_work_queue);
295  while (!batch_empty()) {
296  process_the_work(std::bind(&statistics_type::processed_vertical_work, &statistics_), cfg_type::vertical_edge_annotation);
297  }
298  }
299 
300  statistics_type const &__fastcall statistics() const noexcept(true) FORCE_INLINE {
301  return statistics_;
302  }
303  statistics_type &__fastcall statistics() noexcept(true) FORCE_INLINE {
304  return statistics_;
305  }
306 
307  private:
308  statistics_type statistics_;
309  typename signalled_work_queue_type::value_type current_work;
310  };
311 
312  /// Interface for allowing an execution context to potentially execute work horizontally whilst the execution_context is held, so that we both keep the cores busy and avoid deadlock due to resource starvation from a lack of available threads to process input_work (tasks) from the signalled_work_queue in the thread_pool_type.
313  template<generic_traits::return_data RD, class TPB, template<class> class Del, template<class> class AtCtr>
315  public:
316  typedef TPB thread_pool_type;
317  typedef typename thread_pool_type::pool_traits_type pool_traits_type; ///< The pool traits.
323  typedef typename thread_wk_t::work_complete_t work_complete_t; ///< This atomic object is the object that is used to signal to a waiting future that the work has been completed.
325 
326  /**
327  To assist in allowing compile-time computation of the algorithmic order of the threading model.
328  */
337  );
338 
340 
341  protected:
342  constexpr horizontal_execution_itf() noexcept(true) FORCE_INLINE {}
343  /// Can't be used polymorphically - to maintain the concept that this is a stack allocated object.
345 
346  virtual work_complete_t &__fastcall work_complete() noexcept(true)=0;
347  virtual work_complete_t &__fastcall work_complete() const noexcept(true)=0;
348 
349  virtual thread_wk_t const & __fastcall core_work() const noexcept(true)=0;
350 
351  /// Check to see if the work has been completed.
352  /**
353  This is a non-blocking call. Note that if this is used unwisely, race-conditions or deadlocks will occur in the users' code.
354 
355  \return If the work is joinable, returns true if the work has been executed by a thread in the pool, otherwise in all other cases returns false.
356  */
357  bool __fastcall work_done() const noexcept(true) FORCE_INLINE {
358  return this->work_complete().try_lock()==os_traits::lock_traits::atom_set;
359  }
360 
361  /**
362  By default no horizontal execution is performed, we just wait for the closure_base-derived closure to be process()ed.
363  */
364  virtual void __fastcall wait_or_horizontal_thread() const noexcept(false) FORCE_INLINE {
365  this->work_complete().lock();
366  }
367  };
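A condensed, standalone sketch of the completion-signalling protocol behind work_complete(), work_done() and the default wait_or_horizontal_thread(): a non-blocking query and a blocking wait over the same flag. The type work_complete_flag is a hypothetical stand-in for the library's event/atomic types:

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    class work_complete_flag {
    public:
        // Non-blocking query, analogous to work_done()'s try_lock(): never waits.
        bool done() const noexcept {
            const std::lock_guard<std::mutex> lk(m);
            return completed;
        }
        // Blocking wait, analogous to the default wait_or_horizontal_thread().
        void wait() const {
            std::unique_lock<std::mutex> lk(m);
            cv.wait(lk, [this] { return completed; });
        }
        // Called by the worker thread once the closure has been process()ed.
        void set() noexcept {
            {
                const std::lock_guard<std::mutex> lk(m);
                completed = true;
            }
            cv.notify_all();
        }

    private:
        mutable std::mutex m;
        mutable std::condition_variable cv;
        bool completed = false;
    };

    int main() {
        work_complete_flag flag;
        std::thread worker([&flag] { flag.set(); });   // stands in for a pool_thread
        if (!flag.done()) {                            // non-blocking check first
            flag.wait();                               // otherwise block until signalled
        }
        worker.join();
        return 0;
    }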
368 
369  template<class TPB, template<class> class Del, template<class> class AtCtr>
371  public:
374  typedef typename base_t::pool_traits_type pool_traits_type; ///< The pool traits.
375  typedef typename base_t::os_traits os_traits;
376  typedef typename base_t::pool_type pool_type;
377  typedef typename base_t::atomic_t atomic_t;
379  typedef typename base_t::thread_wk_t thread_wk_t;
381 
382  /**
383  To assist in allowing compile-time computation of the algorithmic order of the threading model.
384  */
386 
387  /// Erase the thread_wk_t item from the queue in the thread_pool, if it is still in there.
388  /**
389  Note that the queue_size() of the thread_pool is not guaranteed to have been reduced by one until the execution_context has been waited upon, either explicitly (via dereference) or implicitly (via destruction). (Amongst other reasons, this is because a worker-thread might have already removed the item from the queue to mutate it.)
390 
391  \see thread_pool_type::queue_size()
392  */
393  bool __fastcall erase() noexcept(false) FORCE_INLINE {
394  const typename os_traits::lock_traits::critical_section_type::write_lock_type lock(erase_lock, os_traits::lock_traits::infinite_timeout());
395  if (waiting.try_lock()==os_traits::lock_traits::atom_unset) {
396  erased.set();
397  return false;
398  } else {
399  return true;
400  }
401  }
402 
403  /// Ensure that if an execution context is passed to another function, only a constant version may be passed.
404  /**
405  This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread, i.e. only one thread can write to the results, but many can read, because multiple read operations do not require locking with respect to each other. Note that if the results are accessed, then that call will block, as necessary, until the results are written, thus ensuring that all reads follow any writes to the contained data.
406  */
407  virtual const eraseable_execution_context_base * __fastcall operator&() const noexcept(true)=0;
408 
409  protected:
412  }
413 
414  /// Can't be used polymorphically - to maintain the concept that this is a stack allocated object.
416 
417  /// A counted reference to the item of work that has been transferred to the pool for execution.
418  const typename signalled_work_queue_type::value_type & __fastcall
419  wk_queue_item() const noexcept(true)=delete;
420 
421  /// A counted reference to the item of work that has been transferred to the pool for execution.
422  typename signalled_work_queue_type::value_type & __fastcall
423  wk_queue_item() noexcept(true)=delete;
424 
425  /**
426  \return True if the work has not been erased from the queue, false otherwise.
427  */
428  bool __fastcall has_work() const noexcept(true) FORCE_INLINE {
429  bool work_in_queue;
430  {
431  const typename os_traits::lock_traits::critical_section_type::read_lock_type e_lock(erase_lock, os_traits::lock_traits::infinite_timeout());
432  work_in_queue=(erased.try_lock()==os_traits::lock_traits::atom_unset);
433  if (work_in_queue) {
434  waiting.set();
435  }
436  }
437  return work_in_queue;
438  }
439 
440  private:
441  mutable typename os_traits::lock_traits::critical_section_type erase_lock;
442  mutable atomic_t waiting;
443  mutable typename os_traits::lock_traits::anon_event_type erased;
444 
445  template<class ExCxt> friend class call_push_back;
446  };
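The race between erase() and has_work() above can be sketched standalone as two flags guarded by one lock: a consumer claims the work before mutating it, and erasure only succeeds if nothing has claimed it yet. The class name erasable_slot and the flag names are hypothetical:

    #include <cassert>
    #include <mutex>

    class erasable_slot {
    public:
        // Mirrors erase(): returns false when the work was successfully erased
        // (nothing to wait for), true when a consumer already has it and the
        // caller must instead wait upon the result.
        bool erase() {
            const std::lock_guard<std::mutex> lk(m);
            if (!claimed) {
                erased = true;
                return false;
            }
            return true;
        }
        // Mirrors has_work(): returns true and claims the work if it was not erased.
        bool has_work() {
            const std::lock_guard<std::mutex> lk(m);
            if (!erased) {
                claimed = true;
                return true;
            }
            return false;
        }

    private:
        std::mutex m;
        bool claimed = false;
        bool erased = false;
    };

    int main() {
        erasable_slot s;
        assert(s.has_work());   // the consumer claims the work first...
        assert(s.erase());      // ...so a later erase() reports that the work must be waited upon
        return 0;
    }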
447 
448  /// The execution_context may need to avoid dereferencing if the result_type it contains is void.
449  /**
450  An ugly, evil hack: this works around the fact that the standard doesn't allow references to void, only pointers, and the result_type of some of the parallel algorithms is void.
451  */
452  template<class Ret>
454  typedef Ret & type;
455 
456  template<class CW>
457  static constexpr type FORCE_INLINE
458  execute(CW &core_work_) noexcept(true) {
459  return static_cast<type>(core_work_.closure().get_results());
460  }
461  template<class CW>
462  static constexpr type FORCE_INLINE
463  execute(CW const &core_work_) noexcept(true) {
464  return static_cast<type>(core_work_.closure().get_results());
465  }
466  };
467  template<>
468  struct add_ref_if_not_void<void> {
469  typedef void type;
470 
471  template<class CW>
472  static constexpr type FORCE_INLINE
473  execute(CW &) noexcept(true) {
474  }
475  };
476  template<>
477  struct add_ref_if_not_void<void const> {
478  typedef void type;
479 
480  template<class CW>
481  static constexpr type FORCE_INLINE
482  execute(CW &) noexcept(true) {
483  }
484  };
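A standalone illustration of the "reference unless void" workaround described above (simplified, hypothetical names): the trait yields T & for ordinary result types but collapses to void, with a matching no-op accessor, when there is no result to return:

    #include <cassert>
    #include <type_traits>

    // Primary template: ordinary results are handed back by reference.
    template<class Ret>
    struct ref_unless_void {
        typedef Ret &type;
        template<class Holder>
        static type get(Holder &h) noexcept {
            return h.value;
        }
    };
    // Specialisation for void: there is nothing to reference, so return nothing.
    template<>
    struct ref_unless_void<void> {
        typedef void type;
        template<class Holder>
        static type get(Holder &) noexcept {}
    };

    struct int_holder { int value = 42; };
    struct void_holder {};

    int main() {
        int_holder ih;
        ref_unless_void<int>::get(ih) = 7;   // usable as an lvalue reference
        assert(ih.value == 7);
        void_holder vh;
        ref_unless_void<void>::get(vh);      // compiles and does nothing
        static_assert(std::is_same<ref_unless_void<void>::type, void>::value, "collapses to void");
        return 0;
    }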
485 
486  /// The execution_context may need extra dereferencing according to the result_type it contains.
487  namespace deref {
488 
489  /// Just dereference the execution_context as normal.
490  template<class ExCxt>
491  struct noop {
492  typedef ExCxt excution_context;
497 
498  static constexpr const_ref_result_type deref(excution_context const &e) noexcept(false) FORCE_INLINE {
499  return e.get_results();
500  }
501  static constexpr ref_result_type deref(excution_context &e) noexcept(false) FORCE_INLINE {
502  return e.get_results();
503  }
504  static constexpr const_addr_result_type arrow(excution_context const &e) noexcept(false) FORCE_INLINE {
505  return &e.get_results();
506  }
507  static constexpr addr_result_type arrow(excution_context &e) noexcept(true) FORCE_INLINE {
508  return &e.get_results();
509  }
510  };
511 
512  /// Ensure that when the execution_context is dereferenced to obtain the result any extra dereferences are done as necessary, for example if it is a boolean.
513  template<class ExCxt>
514  struct extra {
515  typedef ExCxt excution_context;
520 
521  static constexpr const_ref_result_type __fastcall deref(excution_context const &e) noexcept(false) FORCE_INLINE {
522  return e.get_results();
523  }
524  static constexpr ref_result_type __fastcall deref(excution_context &e) noexcept(false) FORCE_INLINE {
525  return e.get_results();
526  }
527  static constexpr const_addr_result_type __fastcall arrow(excution_context const &e) noexcept(false) FORCE_INLINE {
528  return &e.get_results();
529  }
530  static constexpr addr_result_type __fastcall arrow(excution_context &e) noexcept(false) FORCE_INLINE {
531  return &e.get_results();
532  }
533  };
534 
535  /// Ensure that when the execution_context is dereferenced to obtain the result any extra dereferences are done as necessary, for example if it is a count.
536  template<class ExCxt>
537  struct extra_deref {
538  typedef ExCxt excution_context;
543 
544  static constexpr const_ref_result_type __fastcall deref(excution_context const &e) noexcept(false) FORCE_INLINE {
545  return e.get_results().get();
546  }
547  static constexpr ref_result_type __fastcall deref(excution_context &e) noexcept(false) FORCE_INLINE {
548  return e.get_results().get();
549  }
550  static constexpr const_addr_result_type __fastcall arrow(excution_context const &e) noexcept(false) FORCE_INLINE {
551  return e.get_results().get();
552  }
553  static constexpr addr_result_type __fastcall arrow(excution_context &e) noexcept(false) FORCE_INLINE {
554  return e.get_results().get();
555  }
556  };
557 
558  }
559 
560  namespace core_work_result {
561 
562  /// Don't initialise the result of the execution_context.
563  template<class CoreWk>
564  struct noop {
565  typedef CoreWk thread_wk_t;
566 
567  static constexpr void init(thread_wk_t &) noexcept(true) FORCE_INLINE {
568  }
569  };
570 
571  /// Default initialise the result of the execution_context.
572  template<class CoreWk>
573  struct to_zero {
574  typedef CoreWk thread_wk_t;
575 
576  static void init(thread_wk_t &core_work) noexcept(true) FORCE_INLINE {
577  core_work.closure().get_results()=typename thread_wk_t::closure_t::result_type();
578  }
579  };
580 
581  /// Default initialise the boolean result of the execution_context to false.
582  template<class CoreWk>
583  struct to_false {
584  typedef CoreWk thread_wk_t;
585 
586  static void init(thread_wk_t &core_work) noexcept(true) FORCE_INLINE {
587  core_work.closure().get_results()=false;
588  }
589  };
590 
591  /// Initialise the result of the execution_context with the provided value from the input initialisation value from the core_work.
592  template<class CoreWk>
593  struct to_op {
594  typedef CoreWk thread_wk_t;
595 
596  static void init(thread_wk_t &core_work) noexcept(true) FORCE_INLINE {
597  core_work.closure().get_results()=typename thread_wk_t::closure_t::result_type(core_work.closure().input().init);
598  }
599  };
600 
601  }
602 
603  /// Class that implements the horizontal execution algorithm.
604  /**
605  It waits for work on the main thread_pool::signalled_work_queue at low priority (to try and ensure that the pool_threads are more likely to get the thread_wk_t), takes one item at a time, i.e. no batching, GSS(k) with k=1, and places any resultant work back into the thread_pool::signalled_work_queue, to be available to all threads.
606 
607  Because the PThreads API lacks a wait-for-multiple-objects primitive, one has to create another thread to execute the work horizontally whilst the execution context is held. Then, once the execution context is released, one has to cancel that thread, but be hyper-careful about when one can cancel it, to ensure not only that any work it is processing is completed, but also that any C++ objects it is holding have their dtors correctly run. Yuck. The thread is created in the call to get_results(), and only if the work has not been completed.
608 
609  \see get_results()
610  */
611  template<generic_traits::return_data RD, class TPB, template<class> class Del, template<class> class AtCtr>
613  protected:
617  typedef typename base_t::pool_type pool_type;
618  typedef typename base_t::os_traits os_traits;
619  typedef typename base_t::thread_wk_t thread_wk_t;
620 
623  }
624  ~horizontal_execution() noexcept(false) FORCE_INLINE {}
625 
626  /// Wait upon the core_work to complete process(), or process() items from the pool::batch_details.
627  /**
628  When items are directly process()ed from the pool::batch_details whilst waiting for the attached core_work to complete, I term this "horizontal threading", as opposed to "vertical threading", which is when core_work items are process()ed in pool_thread pool::thread_types::steal::process(), possibly being batched.
629  This function will attempt to process core_work from the signalled_work_queue, or if the wk_queue_item() upon which it is waiting has been completed, will return. Batching is carefully done: if there are items in the pool::batch_details, then these will be processed first, but only from the same thread as this one.
630 
631  \see steal::batch_type, pool::batch_details
632  */
633  void __fastcall wait_or_horizontal_thread() const noexcept(false) final override {
634  typename os_traits::lock_traits::atomic_state_type wk_complete;
635  assert(dynamic_cast<thread_pool_type *>(&pool));
636  // Process work in the batch, as we may be waiting for an item in the batch to complete before we are released.
637  while ((wk_complete=this->work_complete().try_lock())!=os_traits::lock_traits::atom_set && pool.process_a_batch_item(os_traits::thread_traits::get_current_thread(), this->core_work().exception_thrown_in_thread()));
638  if (wk_complete!=os_traits::lock_traits::atom_set) {
639  // We need to perform the horizontal threading in a separate thread, because we don't have WaitOnMultipleObjects() in PThreads.
640  try {
641  const typename execute_any_work_horizontally::scoped sc(process, this->work_complete(), wk_complete);
642  if (wk_complete!=os_traits::lock_traits::atom_set) {
643  // Wait for the closure_base-derived closure to be transformed, whilst performing any horizontal closure_base-derived closure transformations.
644  this->work_complete().lock();
645  }
646  } catch (...) {
647  this->work_complete().lock();
648  throw;
649  }
650  }
651  }
652 
653  private:
654  class execute_any_work_horizontally final : public wrapper<os_traits::thread_traits::api_params_type::api_type, typename os_traits::thread_traits::model_type> {
655  public:
656  typedef wrapper<os_traits::thread_traits::api_params_type::api_type, typename os_traits::thread_traits::model_type> base_t;
657  typedef typename base_t::lock_traits lock_traits;
658  typedef typename base_t::thread_context_t thread_context_t;
659  using exit_requested_type=typename pool_traits_type::template exit_requested_type<typename thread_pool_type::work_distribution_mode::queue_model>;
660  typedef typename base_t::exception_type exception_type;
661  /**
662  Don't force the thread waiting upon the just-satisfied execution_context to also immediately synchronise with the horizontally executing thread; just request it to stop processing more work, and delay the forced synchronisation until the execution_context leaves scope, i.e. allow some greater opportunity for overlapping the processing of work.
663  */
664  class scoped final {
665  public:
666  scoped(execute_any_work_horizontally &e, typename thread_wk_t::work_complete_t &work_complete, typename os_traits::lock_traits::atomic_state_type &wk_complete) noexcept(false) FORCE_INLINE
667  : thr(e) {
668  // Try to allow the pool_threads a chance to process the work before this horizontal thread is created that might compete with the pool_threads for the work.
669  os_traits::thread_traits::sleep(0);
670  if ((wk_complete=work_complete.try_lock())!=os_traits::lock_traits::atom_set) {
671  thr.create_running();
672  }
673  }
674  ~scoped() noexcept(true) FORCE_INLINE {
675  thr.request_exit();
676  }
677 
678  private:
679  execute_any_work_horizontally &thr;
680  };
681 
682  /**
683  To assist in allowing compile-time computation of the algorithmic order of the threading model.
684  */
685  static constexpr ppd::generic_traits::memory_access_modes memory_access_mode=(
686  base_t::memory_access_mode==ppd::generic_traits::memory_access_modes::crew_memory_access
687  && thread_context_t::memory_access_mode==ppd::generic_traits::memory_access_modes::crew_memory_access
688  && exit_requested_type::memory_access_mode==ppd::generic_traits::memory_access_modes::crew_memory_access
689  ? ppd::generic_traits::memory_access_modes::crew_memory_access
690  : ppd::generic_traits::memory_access_modes::erew_memory_access
691  );
692 
693  execute_any_work_horizontally(thread_pool_type &p, typename os_traits::thread_exception const &ex_thr, const typename os_traits::thread_traits::api_params_type::handle_type ancestor_thr_id) noexcept(true) FORCE_INLINE
694  : base_t(), hrz_work(), pool(p), exception_thrown_in_thread(ex_thr), ancestor_thread_id(ancestor_thr_id) {
695  assert(dynamic_cast<thread_pool_type *>(&pool));
696  }
697 
698  ~execute_any_work_horizontally() noexcept(false) FORCE_INLINE {
699  wait_thread_exit();
700  }
701 
702  void operator=(execute_any_work_horizontally const &)=delete;
703  void operator=(execute_any_work_horizontally &&)=delete;
704 
705  void create_running() noexcept(false) override FORCE_INLINE {
706  base_t::create_running();
707  try {
708  // We want the current horizontal thread to run on the same core as the thread that has been held, waiting for a result, i.e. it was an idle core, now to be used.
709  this->kernel_affinity(os_traits::thread_traits::get_kernel_affinity(ancestor_thread_id));
710  // Ensure that we prefer vertical to horizontal threading, otherwise we can accidentally get convoying of tasks together with (bizarrely) free threads, which seriously ruins scalability.
711  orig_pri=this->kernel_priority();
712  this->kernel_priority(os_traits::thread_traits::api_params_type::idle);
713  } catch (exception_type const &ex) {
714  // Ignore any library errors; it's not a nightmare if the above tweaks don't happen.
715  }
716  }
717 
718  void __fastcall request_exit() const noexcept(true) override FORCE_INLINE {
719  try {
720  base_t::request_exit();
721  } catch (exception_type const &ex) {
722  // Ensure that if we request that a thread should be cancelled, and fail, then we don't later attempt to join on it and potentially lock up in the thread_base::wait_thread_exit() when it might attempt to rejoin with it.
723  const typename lock_traits::critical_section_type::write_lock_type lock(this->thread_params_lock, lock_traits::infinite_timeout());
724  this->thread_params.id=0;
725  this->thread_params.state=os_traits::thread_traits::api_params_type::failed_to_cancel;
726  // Ignore any library errors, as it is telling us nothing that we can do anything about.
727  }
728  }
729 
730  private:
731  /**
732  Prevent excessive core-to-core chatter: accumulate statistics locally, then at the end send them back to the thread_pool. Note that this implies that local gathering can be lock-free.
733  \todo Note that we don't "do" GSS(k) batching here, yet, because it greatly simplifies horizontal threading.
734  */
735  using hrz_work_type=batch_details<1, typename thread_pool_type::pool_traits_type::template signalled_work_queue_type<typename thread_pool_type::work_distribution_mode::queue_model>, typename thread_pool_type::statistics_type>;
736  using setter_type=setter<typename os_traits::thread_traits, os_traits::thread_traits::api_params_type::idle>;
737 
738  hrz_work_type hrz_work;
739  thread_pool_type &pool;
740  typename os_traits::thread_exception const &exception_thrown_in_thread;
741  const typename os_traits::thread_traits::api_params_type::handle_type ancestor_thread_id;
742  typename thread_pool_type::priority_type orig_pri;
743 
744  bool __fastcall pre_exit() noexcept(false) override {
745  if (!base_t::pre_exit()) {
746  // Make sure we carefully control when this thread can be cancelled, to avoid nasty double-exceptions being thrown in client code.
747  const typename os_traits::thread_traits::cancellability set;
748  // We prefer vertical to horizontal threading, so try to prefer the former.
749  os_traits::thread_traits::sleep(0);
750  assert(dynamic_cast<thread_pool_type *>(&pool));
751  const typename exit_requested_type::lock_result_type lkd=pool.exit_requested().lock();
752  // Process work if available, then check for the exit flag (note that the exit flag takes priority in the signalled_work_queue): we don't want random pool_threads to exit, causing resource starvation, because all remaining threads might be waiting for a thread_wk to be processed, and there may be no threads left to do this, thus causing the library to lock up.
753  if (lkd.first==exit_requested_type::states::new_work_arrived) {
754  assert(dynamic_cast<thread_pool_type *>(&pool));
755  return false;
756  } else if (lkd.first==exit_requested_type::states::exit_requested) {
757  // Ensure the rest of the threads in the pool exit.
758  assert(dynamic_cast<thread_pool_type *>(&pool));
759  pool.exit_requested().set(exit_requested_type::states::exit_requested);
760  }
761  }
762  return true;
763  }
764 
765  void __fastcall wait_thread_exit() noexcept(false) FORCE_INLINE {
766  // Cancel the thread, because we have no "WaitForMultipleObjects()" in PThreads.
767  this->exit_requested=true;
768  base_t::wait_thread_exit();
769  assert(dynamic_cast<thread_pool_type *>(&pool));
770  // Note that this may not be locked, so is not guaranteed to be thread-safe, so the result may be incorrect, but it is faster...!
771  pool.set_statistics().add_hrz_work(hrz_work.statistics().total_hrz_work());
772  pool.set_statistics().add_vertical_work(hrz_work.statistics().total_vertical_work());
773  }
774 
775  bool __fastcall worker_fn(thread_context_t &) noexcept(false) override FORCE_INLINE {
776  assert(dynamic_cast<thread_pool_type *>(&pool));
777  hrz_work.refill_batch(pool.queue());
778  const setter_type setter(this->params().id);
779  // Process an item at a time to try and reduce the amount of closure_base-derived closure mutated in this horizontal thread, to give the vertical threads a better stab at the closure_base-derived closure.
780  hrz_work.process_a_batch_item();
781  return false;
782  }
783  };
784 
785  public:
786  /**
787  To assist in allowing compile-time computation of the algorithmic order of the threading model.
788  */
794  );
795 
796  private:
797  mutable execute_any_work_horizontally process;
798 
799  protected:
801  };
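A heavily simplified, standalone sketch of the "wait or steal work horizontally" scheme implemented above: since PThreads offers no wait-for-multiple-objects primitive, a helper thread drains other queued work while the caller blocks on its own result, and is asked to stop once that result arrives. All names here (steal_loop, work_queue, and the use of std::future) are illustrative, not the library's API:

    #include <atomic>
    #include <deque>
    #include <functional>
    #include <future>
    #include <mutex>
    #include <thread>

    std::mutex queue_lock;
    std::deque<std::function<void()>> work_queue;   // other, unrelated work in the pool

    // The "horizontal" thread: drain queued work until asked to stop.
    void steal_loop(std::atomic<bool> const &stop) {
        while (!stop.load()) {
            std::function<void()> wk;
            {
                const std::lock_guard<std::mutex> lk(queue_lock);
                if (!work_queue.empty()) {
                    wk = std::move(work_queue.front());
                    work_queue.pop_front();
                }
            }
            if (wk) {
                wk();                        // process other closures whilst the caller is blocked
            } else {
                std::this_thread::yield();
            }
        }
    }

    int main() {
        for (int i = 0; i < 4; ++i) {
            const std::lock_guard<std::mutex> lk(queue_lock);
            work_queue.push_back([] { /* some other task */ });
        }
        std::promise<int> result;                              // the result this thread is waiting upon
        std::future<int> f = result.get_future();
        std::thread producer([&result] { result.set_value(42); });

        std::atomic<bool> stop{false};
        std::thread horizontal(steal_loop, std::cref(stop));   // steal work whilst we wait
        const int r = f.get();                                 // block on our own result only
        stop.store(true);                                      // then ask the horizontal thread to exit
        horizontal.join();
        producer.join();
        return r == 42 ? 0 : 1;
    }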
802 
803  template<class DM, generic_traits::return_data RD, class TPB, class Wk>
805 
806  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data.
807  /**
808  The execution_context stores the thread_wk_type inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the thread_wk_type on the heap, and also the shared_ptr in it can have a dummy sp_counter_type, noop_atomic_ctr, because the counter doesn't do anything, for a greater saving on atomic operations.
809  No horizontal threading is done in this specialisation, because a master thread distributes the work.
810 
811  \see horizontal_execution
812  */
813  template<class TPB, class Wk>
815  public:
818  typedef typename base_t::pool_traits_type pool_traits_type; ///< The pool traits.
819  typedef typename base_t::os_traits os_traits;
820  typedef typename os_traits::lock_traits lock_traits;
821  typedef typename base_t::pool_type pool_type;
822  typedef typename base_t::atomic_t atomic_t;
824  typedef typename thread_pool_type::template create_direct<Wk> creator_t;
825  typedef typename creator_t::result_type result_type;
828 
829  private:
830  class stack_exec_ctx_helper {
832  using closure_t=typename creator_t::closure_t;
833 
834  public:
835  /**
836  Note how we're pretty damn tricky here, and make use of both the dtor and the counter interfaces, using no-op implementations, because the object is going to be allocated on the stack: we normally manage the memory ourselves. So we not only avoid heap allocation and deallocation, but also the atomic operations that would be required in the counter for the shared_ptr.
837  */
839 
840  /**
841  To assist in allowing compile-time computation of the algorithmic order of the threading model.
842  */
844  };
845 
846  public:
849 
850  /**
851  To assist in allowing compile-time computation of the algorithmic order of the threading model.
852  */
859  );
860 
865  }
866 
867  /**
868  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
869  */
872 
873  /**
874  In case the user didn't specifically call wait(), operator*() or operator->() for some reason, throw any registered exception. (I don't throw the exception in the thread's destructor in the thread pool, as this is too late, and makes evil memory leaks in the thread pool destructor.)
875  */
877  assert(dynamic_cast<thread_wk_t *>(&core_work_));
879  }
880 
881  /// Can't automatically convert to a base-class address automatically - to maintain the concept that this is a stack allocated object.
882  void operator&()=delete;
883  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
884  template<class T> operator T () const=delete;
885  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
886  template<class T> operator T ()=delete;
887  void operator=(execution_context_stack_type const &)=delete;
889 
890  /// A (potentially blocking) access to the results, but only after they are written.
891  /**
892  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations. Throws if the work has been previously erased.
893 
894  \see work_done()
895  */
896  typename add_ref_if_not_void<result_type const>::type __fastcall operator*() const noexcept(false) FORCE_INLINE {
897  return this->get_results();
898  }
899  /// A (potentially blocking) access to the results, but only after they are written.
900  /**
901  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. Throws if the work has been previously erased.
902 
903  \see work_done()
904  */
905  typename add_ref_if_not_void<result_type>::type __fastcall operator*() noexcept(false) FORCE_INLINE {
906  return this->get_results();
907  }
908 
909  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
910  /**
911  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
912 
913  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
914  */
915  result_type const * __fastcall operator->() const noexcept(false) FORCE_INLINE {
916  return &this->get_results();
917  }
918  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
919  /**
920  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
921 
922  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
923  */
924  result_type * __fastcall operator->() noexcept(false) FORCE_INLINE {
925  return &this->get_results();
926  }
927 
928  /// Ensure that if an execution context is passed to another function, only a constant version may be passed.
929  /**
930  This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread, i.e. only one thread can write to the results, but many can read, because multiple read operations do not require locking with respect to each other. Note that if the results are accessed, then that call will block, as necessary, until the results are written, thus ensuring that all reads follow any writes to the contained data.
931  */
932  const execution_context_stack_type * __fastcall operator&() const noexcept(true) override FORCE_INLINE {
933  return this;
934  }
935 
936  /// A counted reference to the item of work that has been transferred to the pool for execution.
937  const typename signalled_work_queue_type::value_type __fastcall
938  wk_queue_item() const noexcept(true) FORCE_INLINE {
940  }
941 
942  /// A counted reference to the item of work that has been transferred to the pool for execution.
943  typename signalled_work_queue_type::value_type __fastcall
944  wk_queue_item() noexcept(true) FORCE_INLINE {
946  }
947 
948  protected:
951 
952  thread_wk_t const & __fastcall core_work() const noexcept(true) final override FORCE_INLINE {
953  return core_work_;
954  }
955  work_complete_t &__fastcall work_complete() noexcept(true) final override FORCE_INLINE {
956  return work_complete_;
957  }
958  work_complete_t &__fastcall work_complete() const noexcept(true) final override FORCE_INLINE {
959  return work_complete_;
960  }
961 
962  typename add_ref_if_not_void<result_type const>::type __fastcall get_results() const noexcept(false) FORCE_INLINE {
963  if (this->has_work()) {
965  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
968  } else {
969  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
970  }
971  }
972 
973  typename add_ref_if_not_void<result_type>::type __fastcall get_results() noexcept(false) FORCE_INLINE {
974  if (this->has_work()) {
976  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
979  } else {
980  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
981  }
982  }
983  };
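To make the dereference semantics above concrete, here is a tiny, standalone analogue (not the library's API) of a joinable transfer whose result is read back by dereferencing the returned context, blocking until the worker has written it; mini_execution_context and its members are hypothetical names:

    #include <future>
    #include <iostream>
    #include <thread>

    // A miniature stand-in for an execution context: dereferencing blocks until the
    // result has been written by the worker, then yields a reference to it.
    template<class Result>
    class mini_execution_context {
    public:
        explicit mini_execution_context(std::future<Result> f)
            : fut(std::move(f)) {}

        Result const &operator*() {
            ensure_result();
            return result;
        }
        Result const *operator->() {
            ensure_result();
            return &result;
        }

    private:
        void ensure_result() {
            if (!have_result) {
                result = fut.get();   // potentially blocking, like get_results()
                have_result = true;
            }
        }
        std::future<Result> fut;
        Result result{};
        bool have_result = false;
    };

    int main() {
        // "Transfer the work to the pool": here just a plain async task.
        mini_execution_context<int> context(std::async(std::launch::async, [] { return 6 * 7; }));
        std::cout << *context << '\n';   // blocks until the result is written, then prints 42
        return 0;
    }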
984 
985  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data, but also allow horizontal threading: if the object being managed has not yet had its result computed, and the queue in the thread_pool is not empty, process an item from that queue in the mean-time.
986  /**
987  This ensures that resource starvation (of threads) cannot occur, because waiting upon a dereference of an execution context no longer blocks that thread: it can process other work in the mean-time, which is important for finite-sized thread_pools. This specialisation performs horizontal threading, and the horizontal thread is also created on the stack, so with cache locality and all, quite good at stealing work.
988  The execution_context stores the thread_wk_type inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the thread_wk_type on the heap, and also the shared_ptr in it can have a dummy sp_counter_type, because the counter doesn't do anything, for a greater saving on atomic operations.
989 
990  \see horizontal_execution
991  */
992  template<class TPB, class Wk>
994  public:
999  typedef typename base_t::os_traits os_traits;
1000  typedef typename base_t::pool_type pool_type;
1001  typedef typename base_t::atomic_t atomic_t;
1003  typedef typename base_t::creator_t creator_t;
1004  typedef typename base_t::result_type result_type;
1005  typedef typename base_t::thread_wk_t thread_wk_t;
1007 
1008  /**
1009  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1010  */
1016  );
1017 
1018  execution_context_stack_type(thread_pool_type &pl, typename thread_wk_t::cfg_details_type::params const &p, typename thread_wk_t::closure_t::argument_type &&wk) noexcept(false) FORCE_INLINE
1019  : base_t(pl, p, std::forward<typename thread_wk_t::closure_t::argument_type>(wk)), base2_t(pl, this->core_work_) {
1020  }
1021 
1022  /**
1023  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
1024  */
1027  ~execution_context_stack_type() noexcept(false) {}
1028 
1029  /// Can't automatically convert to a base-class address automatically - to maintain the concept that this is a stack allocated object.
1030  void operator&()=delete;
1031  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1032  template<class T> operator T () const=delete;
1033  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1034  template<class T> operator T ()=delete;
1037 
1038  private:
1039  template<class ExCxt> friend class call_push_back;
1040  };
1041 
1042  /// Ensure that the compiler emits an error if attempting to non-joinably create an execution context.
1043  /**
1044  This class is not supposed to be constructible; it is just here to allow the code to compile. Non-joinable transfers to a thread pool never create an execution context, and neither do transfers to a non-joinable thread pool.
1045  */
1046  template<class DM, class TPB, class Wk>
1048  public:
1049  typedef TPB thread_pool_type;
1055 
1057  };
1058 
1059  template<class DM, generic_traits::return_data RD, class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk>
1061 
1062  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data.
1063  /**
1064  The execution_context stores the thread_wk_t and algo_thread_wk inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the thread_wk_t and algo_thread_wk on the heap, and also the shared_ptr in it can have a dummy sp_counter_type, noop_atomic_ctr, because the counter doesn't do anything, for a greater saving on atomic operations.
1065  No horizontal threading is done in this specialisation, because a master thread distributes the work.
1066 
1067  \see horizontal_execution
1068  */
1069  template<class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk>
1071  public:
1075  typedef typename base_t::os_traits os_traits;
1076  typedef typename base_t::pool_type pool_type;
1077  typedef typename base_t::atomic_t atomic_t;
1079  typedef typename thread_pool_type::template create_direct<Wk> creator_t;
1080  typedef typename creator_t::result_type result_type;
1083 
1084  private:
1086  typedef typename thread_pool_type::template create_direct<alg_wrap_t> alg_wrap_creator_t;
1087 
1089  using closure_t=typename creator_t::closure_t;
1090 
1091  public:
1093 
1094  /**
1095  Note how we're pretty damn tricky here, and make use of both the dtor and the counter interfaces, using no-op implementations, because the object is going to be allocated on the stack: we normally manage the memory ourselves. So we not only avoid heap allocation/deallocation, but also the atomic operations that would be required in the counter for the shared_ptr.
1096  */
1098 
1099  /**
1100  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1101  */
1107  );
1108  };
1109 
1110  void add_work(const typename thread_pool_type::pool_type::size_type clique) noexcept(false) FORCE_INLINE {
1114  typename alg_wrap_t::work_wrap(
1116  core_work_.closure(),
1117  clique
1118  ),
1120  );
1121  leaf_wk.process();
1122  }
1123 
1124  public:
1127 
1128  /**
1129  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1130  */
1138  );
1139 
1142  // Ensure any resizing of the output collection is done whilst the inputs & output collections are locked to avoid the inputs being resized in the mean-time.
1144  add_work(clique);
1145  }
1149  add_work(clique);
1150  }
1151 
1152  /**
1153  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
1154  */
1157 
1158  /**
1159  In case the user didn't specifically call wait(), operator*() or operator->() for some reason, throw any registered exception. (I don't throw the exception in the thread's destructor in the thread pool, as this is too late, and makes evil memory leaks in the thread pool destructor.)
1160  */
1162  assert(dynamic_cast<thread_wk_t *>(&core_work_));
1164  }
1165 
1166  /// Can't automatically convert to a base-class address automatically - to maintain the concept that this is a stack allocated object.
1167  void operator&()=delete;
1168  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1169  template<class T> operator T () const=delete;
1170  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1171  template<class T> operator T ()=delete;
1174 
1175  /// A (potentially blocking) access to the results, but only after they are written.
1176  /**
1177  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations. Throws if the work has been previously erased.
1178 
1179  \see work_done()
1180  */
1181  typename add_ref_if_not_void<result_type const>::type __fastcall operator*() const noexcept(false) FORCE_INLINE {
1182  return this->get_results();
1183  }
1184  /// A (potentially blocking) access to the results, but only after they are written.
1185  /**
1186  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. Throws if the work has been previously erased.
1187 
1188  \see work_done()
1189  */
1190  typename add_ref_if_not_void<result_type>::type __fastcall operator*() noexcept(false) FORCE_INLINE {
1191  return this->get_results();
1192  }
1193 
1194  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
1195  /**
1196  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
1197 
1198  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
1199  */
1200  result_type const * __fastcall operator->() const noexcept(false) FORCE_INLINE {
1201  return &this->get_results();
1202  }
1203  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
1204  /**
1205  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
1206 
1207  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
1208  */
1209  result_type * __fastcall operator->() noexcept(false) FORCE_INLINE {
1210  return &this->get_results();
1211  }
1212 
1213  /// Ensure that if an execution context is passed to another function, only a constant version may be passed.
1214  /**
1215  This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. i.e. only one thread can write to the results, but many can read, because multiple read operations do not require locking with respect to each other. Note that if the results are accessed, then that call will block, as necessary, until the results are written, thus ensuring that all reads follow any writes to the contained data.
1216  */
1217  const execution_context_algo_stack_type * __fastcall operator&() const noexcept(true) override FORCE_INLINE {
1218  return this;
1219  }
1220 
1221  protected:
1224 
1225  thread_wk_t const & __fastcall core_work() const noexcept(true) final override FORCE_INLINE {
1226  return core_work_;
1227  }
1229  return work_complete_;
1230  }
1231  work_complete_t &__fastcall work_complete() const noexcept(true) final override FORCE_INLINE {
1232  return work_complete_;
1233  }
1234 
1235  private:
1236  template<class ExCxt> friend class call_push_back;
1237 
1238  typename add_ref_if_not_void<result_type const>::type __fastcall get_results() const noexcept(false) FORCE_INLINE {
1239  if (this->has_work()) {
1240  this->wait_or_horizontal_thread();
1241  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
1244  } else {
1245  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
1246  }
1247  }
1248 
1249  typename add_ref_if_not_void<result_type>::type __fastcall get_results() noexcept(false) FORCE_INLINE {
1250  if (this->has_work()) {
1251  this->wait_or_horizontal_thread();
1252  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
1255  } else {
1256  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
1257  }
1258  }
1259  };
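// [Illustrative sketch, not part of the library.] A minimal, hypothetical analogue of the
// blocking operator*()/operator->() accessors documented above: the dereference blocks until
// the worker thread has signalled completion, so every read of the result is ordered after the
// single write, and any exception captured during the mutation is re-thrown to the caller.
// All names below (blocking_result, example_blocking_result) are invented for illustration only.
#include <condition_variable>
#include <exception>
#include <mutex>
#include <optional>
#include <thread>

template<class Result>
class blocking_result {
public:
	// Called by the worker thread exactly once: publish the result and signal completion.
	void set(Result r) {
		{
			std::lock_guard<std::mutex> lock(mtx_);
			result_=std::move(r);
		}
		done_.notify_all();
	}
	// Called by the worker thread if the mutation threw: capture the exception instead.
	void set_exception(std::exception_ptr e) {
		{
			std::lock_guard<std::mutex> lock(mtx_);
			error_=e;
		}
		done_.notify_all();
	}
	// Potentially blocking read access, only available after the result has been written.
	Result const &operator*() const {
		std::unique_lock<std::mutex> lock(mtx_);
		done_.wait(lock, [this] { return result_.has_value() || error_; });
		if (error_) {
			std::rethrow_exception(error_);
		}
		return *result_;
	}
	Result const *operator->() const {
		return &**this;
	}

private:
	mutable std::mutex mtx_;
	mutable std::condition_variable done_;
	std::optional<Result> result_;
	std::exception_ptr error_;
};

// Example usage: the main thread dereferences, blocking until the worker has written.
inline int example_blocking_result() {
	blocking_result<int> res;
	std::thread worker([&res] { res.set(42); });
	const int answer=*res;	// Blocks until the worker calls set().
	worker.join();
	return answer;
}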
1260 
1261  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data, but also allow horizontal threading: if the object being managed has not yet had its result computed, and the queue in the thread_pool is not empty, process an item from that queue in the mean-time.
1262  /**
1263  This ensures that resource starvation (of threads) cannot occur, as waiting upon a dereference of an execution context no longer blocks that thread, because that thread can process other work in the mean-time, which is important for finite-sized thread_pools. This specialisation performs horizontal threading, and the horizontal thread is also created on the stack, so, with the benefit of cache locality, it is quite good at stealing work.
1264  The execution_context stores the algo_thread_wk inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the algo_thread_wk on the heap, and the shared_ptr in it can have a dummy sp_counter_type, because the counter doesn't do anything, giving a further saving on atomic operations.
1265 
1266  \see horizontal_execution
1267  */
1268  template<class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk>
1270  public:
1274  typedef typename thread_pool_type::pool_traits_type pool_traits_type; ///< The pool traits.
1279  typedef typename base_t::thread_wk_t thread_wk_t;
1281 
1282  /**
1283  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1284  */
1290  );
1291 
1295  }
1299  }
1300 
1301  /**
1302  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
1303  */
1307  }
1308 
1311 
1312  private:
1313  template<class ExCxt> friend class call_push_back;
1314  };
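// [Illustrative sketch, not part of the library.] A hypothetical, stripped-down illustration of
// the "horizontal threading" idea described above: while a result is still outstanding, the
// waiting thread does not simply block, it pops and executes other items from the shared work
// queue, so a finite set of pool threads cannot starve waiting upon each other. The queue type
// and the wait_or_steal() helper are inventions for this sketch only.
#include <atomic>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>

struct work_queue {
	std::mutex mtx;
	std::deque<std::function<void()>> items;

	// Try to remove one item; returns an empty function if the queue is empty.
	std::function<void()> try_pop() {
		std::lock_guard<std::mutex> lock(mtx);
		if (items.empty()) {
			return {};
		}
		std::function<void()> item=std::move(items.front());
		items.pop_front();
		return item;
	}
};

// Instead of blocking until "done" becomes true, keep stealing work from the queue; only yield
// when the queue is empty and the result is still not ready.
inline void wait_or_steal(std::atomic<bool> const &done, work_queue &queue) {
	while (!done.load(std::memory_order_acquire)) {
		if (std::function<void()> item=queue.try_pop()) {
			item();	// Make progress on someone else's work whilst we wait.
		} else {
			std::this_thread::yield();	// Nothing to steal right now; let other threads run.
		}
	}
}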
1315 
1316  /// Ensure that the compiler emits an error if attempting to non-joinably create an execution context.
1317  /**
1318  This class is not supposed to be constructible. It is just here to allow the compiler to compile the code. Non-joinable transfers to a thread pool never create an execution context. And transfers to a non-joinable thread pool also never create an execution context.
1319  */
1320  template<class DM, class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk>
1322  public:
1329 
1331  };
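// [Illustrative sketch, not part of the library.] The guard class above exists only so that the
// code compiles; actually constructing an execution context for a non-joinable transfer must be
// a compile-time error. A minimal, hypothetical way to express the same intent is a type whose
// constructors are deleted, so that it can be named in template code but never instantiated.
struct no_context {
	no_context()=delete;	// Non-joinable transfers never yield a context to construct.
	no_context(no_context const &)=delete;
	no_context &operator=(no_context const &)=delete;
};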
1332 
1333  template<class DM, generic_traits::return_data RD, template<class, class, class, template<class> class, template<class> class> class AlgCoreWk, class GenWk, class Wk, template<class> class Deref=deref::noop, template<class> class InitCoreWk=core_work_result::noop>
1334  class execution_context_algo_buff_stack_type;
1335 
1336  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data.
1337  /**
1338  The execution_context stores the thread_wk_t inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the thread_wk_t on the heap, and the shared_ptr in it can have a dummy sp_counter_type, noop_atomic_ctr, because the counter doesn't do anything, giving a further saving on atomic operations.
1339  No horizontal threading is done in this specialisation, because a master thread distributes the work.
1340 
1341  \todo Add an extra template parameter to supply modifiable functionality to operator*() & operator->(), with a no-op default. Then accumulate, count & find may use this class.
1342 
1343  \see horizontal_execution
1344  */
1345  template<template<class, class, class, template<class> class, template<class> class> class AlgCoreWk, class GenWk, class Wk, template<class> class Deref, template<class> class InitCoreWk>
1347  public:
1348  typedef GenWk gen_wk_t;
1352  typedef typename base_t::os_traits os_traits;
1353  typedef typename base_t::pool_type pool_type;
1354  typedef typename base_t::atomic_t atomic_t;
1356  typedef typename thread_pool_type::template create_direct<Wk> creator_t;
1357  typedef typename creator_t::result_type result_type;
1359 
1360  private:
1362  friend dereference_ops;
1363 
1366  using closure_t=typename creator_t::closure_t;
1368 
1369  public:
1370  /**
1371  Note how we're pretty damn tricky here, and make use of both the dtor and the counter interfaces, using no-op implementations, because the object is going to be allocated on the stack and, normally, we manage the memory ourselves. So we save by avoiding heap allocation/deallocation, and also avoid the atomic operations that would be required in the counter for the shared_ptr.
1372  */
1375 
1376  /**
1377  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1378  */
1380  };
1381  typedef typename thread_pool_type::template create_direct<gen_wk_t> gen_wk_creator_t;
1382 
1383  void add_work(thread_pool_type &pool, const typename thread_pool_type::pool_type::size_type cliques, const unsigned short default_num_subranges) noexcept(false) FORCE_INLINE {
1387  typename gen_wk_creator_t::closure_t gen_wk(
1388  gen_wk_t(
1389  pool,
1390  core_work_.closure(),
1394  cliques
1395  ),
1397  );
1399 // TODO Set start & end times of processing op, so that time-order of nodes in DFG can be seen.
1400  gen_wk.process();
1401  }
1402 
1403  public:
1406 
1407  /**
1408  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1409  */
1416  );
1417 
1422  p
1423  ) {
1424  // Ensure any resizing of the output collection is done whilst the inputs & output collections are locked to avoid the inputs being resized in the mean-time.
1427  }
1432  p
1433  ) {
1436  }
1437 
1438  /**
1439  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
1440  */
1443 
1444  /**
1445  In case the user didn't specifically call wait(), operator*() or operator->() for some reason, throw any registered exception. (I don't throw the exception in the thread's destructor in the thread pool, as this is too late, and makes evil memory leaks in the thread pool destructor.)
1446  */
1448  assert(dynamic_cast<thread_wk_t *>(&core_work_));
1450  }
1451 
1452  /// Can't automatically convert to a base-class address - to maintain the concept that this is a stack allocated object.
1453  void operator&()=delete;
1454  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1455  template<class T> operator T () const=delete;
1456  /// Attempt to remove the ability to subvert the safety by incorrectly casting the execution_context.
1457  template<class T> operator T ()=delete;
1460 
1461  /// A (potentially blocking) access to the results, but only after they are written.
1462  /**
1463  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations. Throws if the work has been previously erased.
1464 
1465  \see work_done()
1466  */
1467  typename dereference_ops::const_ref_result_type __fastcall operator*() const noexcept(false) FORCE_INLINE {
1468  return dereference_ops::deref(*this);
1469  }
1470  /// A (potentially blocking) access to the results, but only after they are written.
1471  /**
1472  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. Throws if the work has been previously erased.
1473 
1474  \see work_done()
1475  */
1476  typename dereference_ops::ref_result_type __fastcall operator*() noexcept(false) FORCE_INLINE {
1477  return dereference_ops::deref(*this);
1478  }
1479 
1480  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
1481  /**
1482  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
1483 
1484  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
1485  */
1486  typename dereference_ops::const_addr_result_type __fastcall operator->() const noexcept(false) FORCE_INLINE {
1487  return dereference_ops::arrow(*this);
1488  }
1489  /// A (potentially blocking) access to the results, but only after they are written, or process other work from the signalled_work_queue or batch whilst waiting for the core_work to be processed.
1490  /**
1491  Obtain the results of the mutation of the input work. Note that this is a potentially blocking call: it will return only when the mutation has been signalled as completed. i.e. the work has been transferred, joinably to the pool, then executed, and not erased beforehand. Also note that this may throw an exception of the type specified by any of the exception specifications that may have been used when transferring the work to the pool. This function allows write access to the results (in the current stack-frame, i.e. thread), but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. (This operator has been provided to allow chaining of "operator->()"s by the compiler.) Throws if the work has been previously erased.
1492 
1493  \see work_done(), get_results(), wait_or_horizontal_thread(), pool::batch_details
1494  */
1495  typename dereference_ops::addr_result_type __fastcall operator->() noexcept(false) FORCE_INLINE {
1496  return dereference_ops::arrow(*this);
1497  }
1498 
1499  /// Ensure that if an execution context is passed to another function, only a constant version may be passed.
1500  /**
1501  This function allows read access to the results, but only after the other thread has written, thus implying a sequential ordering of memory operations by the current thread. i.e. only one thread can write to the results, but many can read, because multiple read operations do not require locking with respect to each other. Note that if the results are accessed, then that call will block, as necessary, until the results are written, thus ensuring that all reads follow any writes to the contained data.
1502  */
1503  const execution_context_algo_buff_stack_type * __fastcall operator&() const noexcept(true) override FORCE_INLINE {
1504  return this;
1505  }
1506 
1507  protected:
1510 
1511  thread_wk_t const & __fastcall core_work() const noexcept(true) final override FORCE_INLINE {
1512  return core_work_;
1513  }
1515  return work_complete_;
1516  }
1517  work_complete_t &__fastcall work_complete() const noexcept(true) final override FORCE_INLINE {
1518  return work_complete_;
1519  }
1520 
1521  private:
1522  template<class ExCxt> friend class call_push_back;
1523 
1524  typename add_ref_if_not_void<result_type const>::type __fastcall get_results() const noexcept(false) FORCE_INLINE {
1525  if (this->has_work()) {
1526  this->wait_or_horizontal_thread();
1527  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
1530  } else {
1531  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
1532  }
1533  }
1534 
1535  typename add_ref_if_not_void<result_type>::type __fastcall get_results() noexcept(false) FORCE_INLINE {
1536  if (this->has_work()) {
1537  this->wait_or_horizontal_thread();
1538  // Right - we now check for any uncaught (by the user type, that is) exceptions that were caught by the thread class, and throw it.
1541  } else {
1542  throw exception_type(_T("No results: work previously erased from the pool."), info::function(__LINE__, __PRETTY_FUNCTION__, typeid(*this)), JMMCG_REVISION_HDR(_T(LIBJMMCG_VERSION_NUMBER)));
1543  }
1544  }
1545  };
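// [Illustrative sketch, not part of the library.] The comments above describe giving the internal
// shared_ptr a dummy, no-op counter because the wrapped work lives on the stack of the execution
// context and is never shared beyond it. A hypothetical policy-based counter shows the saving:
// the atomic policy pays for lock-prefixed read-modify-write operations, the no-op policy pays
// nothing, yet both satisfy the same interface. The names (atomic_ctr, noop_ctr) are invented.
#include <atomic>
#include <cstddef>

struct atomic_ctr {	// Real reference count: needed when ownership is genuinely shared across threads.
	std::atomic<std::size_t> value{1};
	void add_ref() noexcept { value.fetch_add(1, std::memory_order_relaxed); }
	bool release() noexcept { return value.fetch_sub(1, std::memory_order_acq_rel)==1; }
};

struct noop_ctr {	// Dummy count: the execution context's stack frame owns the work, so do nothing.
	void add_ref() noexcept {}
	bool release() noexcept { return false; }	// Never reports "last owner", so nothing is ever deleted.
};

// The same handle code can be instantiated with either policy; choosing noop_ctr removes all
// atomic operations from the copy and destroy paths.
template<class Ctr>
void copy_and_destroy(Ctr &ctr) {
	ctr.add_ref();	// What copying the internal shared_ptr would do.
	ctr.release();	// What destroying that copy would do.
}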
1546 
1547  /// Enforce a sequential-consistency memory-model on the result data that this object manages, via the accessors to the result data, but also allow horizontal threading: if the object being managed has not yet had its result computed, and the queue in the thread_pool is not empty, process an item from that queue in the mean-time.
1548  /**
1549  This ensures that resource starvation (of threads) cannot occur, as waiting upon a dereference of an execution context no longer blocks that thread, because that thread can process other work in the mean-time, which is important for finite-sized thread_pools. This specialisation performs horizontal threading, and the horizontal thread is also created on the stack, so, with the benefit of cache locality, it is quite good at stealing work.
1550  The execution_context stores the algo_thread_wk inside it, so it is allocated on the stack, not on the heap, which is only useful for classic, joinable, data-flow operations. This optimisation saves allocating the algo_thread_wk on the heap, and the shared_ptr in it can have a dummy sp_counter_type, because the counter doesn't do anything, giving a further saving on atomic operations.
1551 
1552  \see horizontal_execution
1553  */
1554  template<template<class, class, class, template<class> class, template<class> class> class AlgCoreWk, class GenWk, class Wk, template<class> class Deref, template<class> class InitCoreWk>
1556  public:
1560  typedef typename thread_pool_type::pool_traits_type pool_traits_type; ///< The pool traits.
1565  typedef typename base_t::thread_wk_t thread_wk_t;
1567 
1568  /**
1569  To assist in allowing compile-time computation of the algorithmic order of the threading model.
1570  */
1576  );
1577 
1581  }
1585  }
1586 
1587  /**
1588  This needs to be declared, to be standards compliant, but needn't be defined, as cctor elision doesn't require the definition.
1589  */
1592 
1594  }
1595 
1598 
1599  private:
1600  template<class ExCxt> friend class call_push_back;
1601  };
1602 
1603  /// Ensure that the compiler emits an error if attempting to non-joinably create an execution context.
1604  /**
1605  This class is not supposed to be constructible. It is just here to allow the compiler to compile the code. Non-joinable transfers to a thread pool never create an execution context. And transfers to a non-joinable thread pool also never create an execution context.
1606  */
1607  template<class DM, template<class, class, class, template<class> class, template<class> class> class AlgCoreWk, class GenWk, class Wk, template<class> class Deref, template<class> class InitCoreWk>
1609  public:
1616 
1618  };
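// [Illustrative sketch, not part of the library.] The execution contexts above delete the mutable
// operator&() and the conversion operators, and only provide a const-qualified operator&() that
// returns a pointer-to-const, so a context may only ever be handed to other code as a constant.
// A hypothetical stand-alone type showing the same pattern; stack_only, observe() and
// example_stack_only() are invented names for this sketch only.
struct stack_only {
	void operator&()=delete;								// No mutable address-of: cannot be stored or aliased mutably.
	template<class T> operator T() const=delete;	// No sneaky conversions either...
	template<class T> operator T()=delete;				// ...whether const or not.

	stack_only const *operator&() const noexcept {	// Passing it on is allowed, but only as const.
		return this;
	}
};

// Usage: a function receiving the object can only ever see it through a pointer-to-const.
inline void observe(stack_only const *) {}
inline void example_stack_only() {
	stack_only ctx;
	stack_only const &cref=ctx;
	observe(&cref);	// OK: only the const operator&() is viable for const access.
	// observe(&ctx);	// Would not compile: the non-const operator&() is deleted.
}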
1619 
1620 } } } }
1621 
1622 #endif