thread_pool_aspects.hpp
1 #ifndef LIBJMMCG_CORE_THREAD_POOL_TRAITS_HPP
2 #define LIBJMMCG_CORE_THREAD_POOL_TRAITS_HPP
3 
4 /******************************************************************************
5 ** Copyright © 2004 by J.M.McGuiness, coder@hussar.me.uk
6 **
7 ** This library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU Lesser General Public
9 ** License as published by the Free Software Foundation; either
10 ** version 2.1 of the License, or (at your option) any later version.
11 **
12 ** This library is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ** Lesser General Public License for more details.
16 **
17 ** You should have received a copy of the GNU Lesser General Public
18 ** License along with this library; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
22 #include "batch.hpp"
23 #include "intrusive.hpp"
24 #include "priority_queue.hpp"
27 
28 #include <array>
29 #include <queue>
30 
31 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd {
32 
33  namespace lock {
34 
35  template<
36  class St,
37  St UnSig,
38  St Pri
39  >
41  public:
51  using states=St;
53  static constexpr states unsignalled=UnSig;
54  static constexpr states priority=Pri;
55 
56  /**
57  To assist in allowing compile-time computation of the algorithmic order of the threading model.
58  */
60 
61  private:
63 
64  public:
65 
66  BOOST_MPL_ASSERT((std::is_same<typename write_lock_type::atomic_t, locker_type>));
67 
68  constexpr new_event_signal() noexcept(true) FORCE_INLINE
70  }
72 
73  static locker_type &__fastcall locker() noexcept(true) FORCE_INLINE {
74  static locker_type lk;
75  return lk;
76  }
77  atomic_state_type __fastcall set_nolk(states const s) noexcept(noexcept(semaphore.set())) FORCE_INLINE {
78  if (state_!=priority) {
79  state_=s;
80  }
81  return semaphore.set();
82  }
83  atomic_state_type __fastcall set(states const s) noexcept(false) FORCE_INLINE {
84  return set_nolk(s);
85  }
86  atomic_state_type __fastcall reset() noexcept(true) FORCE_INLINE {
88  if (ret==lock_traits::atom_unset) {
90  }
91  return ret;
92  }
93  void clear() noexcept(noexcept(semaphore.try_lock())) FORCE_INLINE {
96  }
97 
98  lock_result_type __fastcall lock() noexcept(noexcept(semaphore.lock())) FORCE_INLINE {
100  }
101  lock_result_type __fastcall lock(const typename lock_traits::timeout_type t) noexcept(noexcept(semaphore.lock(t))) FORCE_INLINE {
103  }
104  lock_result_type __fastcall try_lock() noexcept(noexcept(semaphore.try_lock())) FORCE_INLINE {
106  }
107  lock_result_type __fastcall unlock() noexcept(noexcept(semaphore.unlock())) FORCE_INLINE {
109  }
110  constexpr count_type __fastcall count() const noexcept(noexcept(semaphore.count())) FORCE_INLINE {
111  return semaphore.count();
112  }
113 
114  private:
115  states state_;
116  };
117 
118  }
119 
120  namespace private_ {
121 
122  template<class T>
123  struct no_op final {};
124 
125  template<template<class> class T,class V>
126  struct def_key_compare final {
127  typedef T<V> result;
128  };
129  template<class V>
131  typedef std::less<V> result;
132  };
133 
134  }
135 
 136  /// A namespace to hold various traits relating to selecting the specific specialisation of thread_pool that the user would like.
137  /**
 138  These parameters allow the user to choose a thread_pool that has the properties they would like. Note that not all combinations make sense: those that do not will give compilation errors, as they are not implemented, and of the rest some may not yet be implemented. Contact the author if you need a specific specialisation!
139  */
140  namespace pool_traits {
141 
142  namespace private_ {
143 
144  template<class QM, class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
145  struct thread_pool_queue_details_back_batch_queue;
146  template<class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
149 
150  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
151  /**
152  This is modelling the functionality of "WaitForMultipleObjects(...)" in the Win32 API.
153  */
155  EvSts,
158  CST
159  >;
160  using lock_all_type=typename lock::any_order::all<
165  >;
166  using element_type=typename Cont::value_type;
167  using value_ret_type=std::array<element_type, GSSk>; ///< Return a container of items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
168  /// The signalled_work_queue_type is an adapted list to add some thread-safety, because we need to perform multiple operations on it, atomically.
169  /**
170  Basically this has been defined as a FIFO or LIFO queue that has locking semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
171 
172  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
173 
 174  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
175  This specifically implies that adding and removing an item takes constant time.
176 
177  This queue would simply implement a bakers' scheduling algorithm: i.e. the active pool_thread would remove k items from the front of the queue to be mutated by that thread.
178 
 179  1. The items in the queue would be a list of work items, i.e. V should be a container.
 180  2. When an item is added to the queue, it would be appended to the last item's queue, leaving some distance between the last & first item, e.g. the number of threads in the pool, to try to load-balance the work-load.
 181  3. The pool_threads would loop through the list of items that they extract to perform the mutations.
 183  4. The pool_threads would remove at most GSSk work-items from the queue and loop through the list of items that each extracted to perform those mutations, as sketched below.
184 
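  The following is a minimal, illustrative sketch of the batching idea described above. It is not this library's implementation (which uses the intrusive containers and lock types selected by these traits); the names gss_k_queue and work_item are invented for the example:
  \code
  #include <array>
  #include <deque>
  #include <functional>
  #include <mutex>

  template<unsigned long k>
  class gss_k_queue {
  public:
   using work_item=std::function<void()>;

   void push_back(work_item w) {
    std::lock_guard<std::mutex> lk(mtx_);
    items_.push_back(std::move(w));
   }
   // Remove at most k items under a single lock acquisition; the returned batch is then
   // processed thread-locally, so contention on the shared queue is reduced.
   std::array<work_item, k> pop_batch() {
    std::array<work_item, k> batch{};
    std::lock_guard<std::mutex> lk(mtx_);
    for (unsigned long i=0; i<k && !items_.empty(); ++i) {
     batch[i]=std::move(items_.front());
     items_.pop_front();
    }
    return batch;	// A pool_thread then runs each non-empty element without further locking.
   }

  private:
   std::mutex mtx_;
   std::deque<work_item> items_;
  };
  \endcode
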
185  \see back_batch
186  \see queue, funky_queue
187  \see intrusive::slist
188 
189  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
190  */
192 // TODO testing stability & performance: funky_queue<
193  queue<
194  Cont,
196  typename exit_requested_type::locker_type::write_lock_type, ///< See: EREW locking used in the signalled_work_queue_type.
200  >, ///< Also try funky_queue as an alternative queue implementation.
201  GSSk
202  >;
205 
206  /**
207  To assist in allowing compile-time computation of the algorithmic order of the threading model.
208  */
210 
211  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
213  /**
214  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
215  */
217 
220  }
221  };
222  template<pool_traits::work_distribution_mode_t::queue_model_t::stealing_mode_t SM, class EvSts, class CST, template<class> class Stats, class Cont, unsigned long GSSk>
225  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
226  /**
227  Because we spin in the thread we can just use a boolean flag to signal exiting.
228  */
230  EvSts,
233  CST
234  >;
235  using lock_all_type=typename lock::any_order::all<
240  >;
241  using element_type=typename Cont::value_type;
242 
243  /// GSS(k) batching is not supported.
244  BOOST_MPL_ASSERT((std::is_same<std::integral_constant<unsigned long, GSSk>, std::integral_constant<unsigned long, 1UL>>));
245 
 246  using value_ret_type=std::array<element_type, GSSk>; ///< Return one item from the front of the queue; the GSS(k) or bakers' scheduling algorithm is not supported.
247  /// The signalled_work_queue_type is a lock-free list, because we need to perform multiple operations on it, atomically.
248  /**
 249  Basically this has been defined as a LIFO or FIFO queue that has lock-free semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
250 
251  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
252 
 253  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
254  This specifically implies that adding and removing an item takes constant time.
255 
256  \see intrusive::slist
257 
258  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
259  */
262  using have_work_type=no_signalling<exit_requested_type>; ///< We're going to busy-wait for new input_work in the pool_threads.
263 
264  /**
265  To assist in allowing compile-time computation of the algorithmic order of the threading model.
266  */
268 
269  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
271 
273  : exit_requested_() {
274  }
275  };
276 
277  template<class QM, class EvSts, template<class> class Stats, class V, template<class> class Comp, unsigned long GSSk>
278  struct thread_pool_queue_details_front_batch_priority_queue;
279  template<class EvSts, template<class> class Stats, class V, template<class> class Comp, unsigned long GSSk>
282  /**
283  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
284  */
286  V,
287  typename V::base_t::lock_traits
288  >;
289  /// To be used by the thread_pool to signal that it requires the pool_threads it manages should exit, or have closure_base-derived closure to process.
290  /**
291  This is modelling the functionality of "WaitForMultipleObjects(...)" in the Win32 API.
292  */
294  EvSts,
297  typename element_type::lock_traits::critical_section_type ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
298  >;
299  using lock_all_type=typename lock::any_order::all<
304  >;
306  /// The signalled_work_queue_type is an adapted list to add some thread-safety, because we need to perform multiple operations on it, atomically.
307  /**
308  Basically this has been defined as a FIFO queue that has locking semantics defined by the specific collection used, and implements a simple list-based scheduling algorithm that assumes that each work-item is the same work-load.
309 
310  \todo JMG: if the work-load has a user-supplied estimate of the time it would take to complete, then we could implement a more sophisticated and better algorithm, e.g. in [1].
311 
 312  The algorithmic complexity of the operations on this container is based upon the complexity of the operations on the underlying container, in this case an intrusive::slist.
313  This specifically implies that adding and removing an item takes constant time.
314 
315  This queue would simply implement a bakers' scheduling algorithm: i.e. the active pool_thread would remove k items from the front of the queue to be mutated by that thread.
316 
 317  1. The items in the queue would be a list of work items, i.e. V should be a container.
 318  2. When an item is added to the queue, it would be appended to the last item's queue, leaving some distance between the last & first item, e.g. the number of threads in the pool, to try to load-balance the work-load.
 319  3. The pool_threads would loop through the list of items that they extract to perform the mutations.
 321  4. The pool_threads would remove at most GSSk work-items from the queue and loop through the list of items that each extracted to perform those mutations.
322 
323  \see back_batch
324  \see queue, funky_queue
325  \see intrusive::slist
326 
327  [1] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
328  */
333  typename exit_requested_type::locker_type::write_lock_type, ///< See: EREW locking used in the signalled_work_queue_type.
 335  std::array<element_type, GSSk> ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm; note that these items are returned in the specified priority order, but once they have been removed, adding a higher-priority item will not affect the already-removed work.
336  >,
337  GSSk
338  >;
341 
342  /**
343  To assist in allowing compile-time computation of the algorithmic order of the threading model.
344  */
346 
347  /// This is one exit event for all of the threads in the pool: so that they can exit in parallel.
349  /**
350  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
351  */
353 
356  }
357  };
358 
359  template<class TPQD, unsigned long GSSk>
360  struct pool_thread_queue_details;
361  /// The pool_threads share a signalled_work_queue in the thread_pool.
362  template<template<class> class TPQD, unsigned long GSSk>
363  struct pool_thread_queue_details<TPQD<pool_traits::work_distribution_mode_t::queue_model_t::pool_owns_queue>, GSSk> {
365  using thread_pool_queue_details=TPQD<queue_model>;
366  using container_type=typename thread_pool_queue_details::container_type;
367  using os_traits=typename container_type::value_type::value_type::os_traits;
368  using statistics_type=typename thread_pool_queue_details::statistics_type;
369  using batch_type=ppd::private_::batch_details<GSSk, container_type, statistics_type>;
 370  /// A resource-efficient event: suspends the waiting thread, and wakes it when the input_work has been processed.
371  using work_complete_t=typename os_traits::lock_traits::anon_event_type;
372  using exit_requested_type=typename thread_pool_queue_details::exit_requested_type;
373  using have_work_type=typename thread_pool_queue_details::have_work_type;
374 
375  /**
376  To assist in allowing compile-time computation of the algorithmic order of the threading model.
377  */
378  static constexpr generic_traits::memory_access_modes memory_access_mode=container_type::memory_access_mode;
379 
380  /**
381  This is the batch that only this thread will process, so it does not need to be thread-safe.
382  */
383  batch_type batch;
384  container_type &signalled_work_queue;
385 
386  explicit pool_thread_queue_details(container_type &q) noexcept(true) FORCE_INLINE
387  : signalled_work_queue(q) {
388  }
389 
390  statistics_type const &statistics() const noexcept(true) FORCE_INLINE {
391  return batch.statistics();
392  }
393  statistics_type &statistics() noexcept(true) FORCE_INLINE {
394  return batch.statistics();
395  }
396  };
397  /// The pool_threads own a signalled_work_queue each, which must be thread-safe, possibly lock-free.
398  /**
399  This implies that the thread_pool assumes a flat memory model, shared across all processors. This may not reflect reality well, in the face of caches.
 400  This idea originated in discussions with Colin Egan: this could be instantiated with a lock-free LIFO queue (to keep the cache hot) per pool_thread, allowing the pool_threads to steal from one another's queues, but only taking from the tail, not the head, so that stolen work should be cold in the cache of the associated pool_thread. Priorities are not supported.
 401  According to [1], having a queue per pool_thread with work-stealing between those queues may work better for finer-grained input_work, as long as identifying the queue with the most work is efficient. A sketch of this stealing discipline is given below.
402 
403  [1] A. Bhattacharjee et al in "Parallelization Libraries: Characterizing and Reducing Overheads" DOI 10.1145/1952998.1953003
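  A minimal sketch of the stealing discipline described above follows. It uses std::deque and std::mutex purely for illustration, whereas this header selects lock-free containers; the type worker_queue is invented for the example:
  \code
  #include <deque>
  #include <functional>
  #include <mutex>
  #include <optional>

  struct worker_queue {
   using work_item=std::function<void()>;

   void push(work_item w) {
    std::lock_guard<std::mutex> lk(mtx_);
    items_.push_front(std::move(w));
   }
   // The owning pool_thread takes from the head: the most recently pushed work, hottest in its cache.
   std::optional<work_item> pop_owner() {
    std::lock_guard<std::mutex> lk(mtx_);
    if (items_.empty()) return std::nullopt;
    work_item w=std::move(items_.front());
    items_.pop_front();
    return w;
   }
   // A thief takes from the tail: the oldest work, most likely to be cold in the owner's cache.
   std::optional<work_item> steal() {
    std::lock_guard<std::mutex> lk(mtx_);
    if (items_.empty()) return std::nullopt;
    work_item w=std::move(items_.back());
    items_.pop_back();
    return w;
   }

  private:
   std::mutex mtx_;
   std::deque<work_item> items_;
  };
  \endcode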
404  */
405  template<template<class> class TPQD, pool_traits::work_distribution_mode_t::queue_model_t::stealing_mode_t SM, unsigned long GSSk>
406  struct pool_thread_queue_details<TPQD<pool_traits::work_distribution_mode_t::queue_model_t::thread_owns_queue<SM>>, GSSk> {
408  using thread_pool_queue_details=TPQD<queue_model>;
409  using container_type=typename thread_pool_queue_details::container_type;
410  using statistics_type=typename thread_pool_queue_details::statistics_type;
 411  /// GSS(k) batching is not supported: there is a queue per thread, owned by that thread, so batching is very unlikely to be of benefit and would only increase the complexity of the code.
412  BOOST_MPL_ASSERT((std::is_same<std::integral_constant<unsigned long, GSSk>, std::integral_constant<unsigned long, 1UL>>));
413  using os_traits=typename container_type::value_type::value_type::os_traits;
414  using no_ref_counting=typename container_type::value_type::no_ref_counting;
415  using cfg_type=typename container_type::value_type::value_type::cfg_type;
416 // TODO using work_complete_t=typename os_traits::lock_traits::anon_event_type;
417  using work_complete_t=typename os_traits::lock_traits::anon_spin_event_type;
418  using exit_requested_type=typename thread_pool_queue_details::exit_requested_type;
419  using have_work_type=typename thread_pool_queue_details::have_work_type;
420 
421  /**
422  To assist in allowing compile-time computation of the algorithmic order of the threading model.
423  */
424  static constexpr generic_traits::memory_access_modes memory_access_mode=container_type::memory_access_mode;
425 
426  /**
427  This is the batch that only this thread will process, so it does not need to be thread-safe.
 428  The plan is to spin on this queue, waiting for work to be added.
429  */
430  container_type batch;
431  statistics_type statistics_;
432 
434  }
435  pool_thread_queue_details(pool_thread_queue_details &&p) noexcept(true) FORCE_INLINE
436  : batch(std::move(p.batch)), statistics_(p.statistics_) {
437  }
438  explicit pool_thread_queue_details(container_type &c) noexcept(true) FORCE_INLINE
439  : batch(c) {
440  }
441 
442  statistics_type const &statistics() const noexcept(true) FORCE_INLINE {
443  return statistics_;
444  }
445  statistics_type &statistics() noexcept(true) FORCE_INLINE {
446  return statistics_;
447  }
448  };
449  }
450 
451  /// The signalled_work_queue_type within the thread_pool will obey strict FIFO semantics.
452  template<
453  class V,
454  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a fifo signalled_work_queue_type.
 455  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
456  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
457  template<class> class Stats
458  >
459  struct normal_fifo {
460  /**
461  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
462  */
463  using internal_container=intrusive::slist<
464  V,
465  typename V::base_t::lock_traits
466  >;
467  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a FIFO signalled_work_queue_type, so no ordering.
469  template<class QM>
471  QM,
472  EvSts,
473  typename internal_container::value_type::lock_traits::critical_section_type, ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
474  Stats,
476  GSSk
477  > {};
478  template<class QM>
479  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
480  };
481 
482  /// The signalled_work_queue_type within the thread_pool will obey strict LIFO semantics.
483  template<
484  class V,
485  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a lifo signalled_work_queue_type.
 486  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
487  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
488  template<class> class Stats
489  >
490  struct normal_lifo {
491  /**
492  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
493  */
494  using internal_container=intrusive::stack<
495  V,
496  typename V::base_t::lock_traits
497  >;
498  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a LIFO signalled_work_queue_type, so no ordering.
500  /// Adapt the stack to look like a list.
501  struct adaptor : public internal_container {
502  using typename internal_container::value_type;
503 
504  value_type __fastcall front() noexcept(true) FORCE_INLINE {
505  return this->top();
506  }
507  value_type const __fastcall front() const noexcept(true) FORCE_INLINE {
508  return this->top();
509  }
510  void __fastcall push_back(value_type const &v) noexcept(true) FORCE_INLINE {
511  this->push(v);
512  }
513  void __fastcall push_back(value_type &&v) noexcept(true) FORCE_INLINE {
514  this->push(std::forward<value_type>(v));
515  }
516  };
517  template<class QM>
519  QM,
520  EvSts,
521  typename internal_container::value_type::lock_traits::critical_section_type, ///< The underlying lock-type to use to lock the signalled_work_queue_type, which will be locked EREW style.
522  Stats,
523  /**
524  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
525  */
526  adaptor,
527  GSSk
528  > {};
529  template<class QM>
530  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
531  };
532 
533  /// The signalled_work_queue_type within the thread_pool will obey strict LIFO semantics.
534  template<
535  class V,
536  template<class> class Comp, ///< The comparator functor is ignored in this case, as this is a lifo signalled_work_queue_type.
 537  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
538  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
539  template<class> class Stats
540  >
542  /**
543  We don't actually need this to be guaranteed lockfree, as locking is done elsewhere, so we can gain a smidge of performance by using raw pointers.
544  */
545  using internal_container=intrusive::stack<
546  V,
547  typename V::base_t::lock_traits
548  >;
549  using key_compare=ppd::private_::no_op<typename internal_container::value_type>; ///< This is a LIFO signalled_work_queue_type, so no ordering.
550  using lock_traits=typename internal_container::value_type::lock_traits;
551  using os_traits=typename internal_container::value_type::value_type::os_traits;
552  using thread_traits=typename os_traits::thread_traits;
554  /// Adapt the stack to look like a list.
555  struct lockfree_to_safe_colln : public internal_container {
556  using typename internal_container::value_type;
557  using value_ret_type=internal_container;
558 
559  static constexpr unsigned long max_size=1UL;
560 
562  : internal_container() {}
564  : internal_container(std::forward<lockfree_to_safe_colln>(a)) {}
565  value_type __fastcall front() noexcept(true) FORCE_INLINE {
566  return this->top();
567  }
568  value_type const __fastcall front() const noexcept(true) FORCE_INLINE {
569  return this->top();
570  }
571  void __fastcall push_back(value_type const &v) noexcept(true) FORCE_INLINE {
572  this->push(v);
573  }
574  void __fastcall push_back(value_type &&v) noexcept(true) FORCE_INLINE {
575  this->push(std::forward<value_type>(v));
576  }
577  value_type pop_front_1_nochk_nosig() noexcept(true) FORCE_INLINE {
578  return this->pop_top_nochk();
579  }
580  };
581  template<class QM>
583  QM,
584  EvSts,
585  api_lock_traits<platform_api, sequential_mode>::critical_section_type, ///< The container is lockfree, and we'll spin for input_work.
586  Stats,
588  GSSk
589  > {};
590  template<class QM>
591  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
592  };
593 
594  /// The signalled_work_queue_type within the thread_pool will operate upon work in some user-defined partial order.
595  template<
596  class V,
 597  template<class> class Comp, ///< The comparator functor that will be used to implement the partial ordering, which implies the priority; an illustrative comparator is sketched after this struct.
 598  class EvSts, ///< The states that may be taken by the event-type used to signal that the signalled_work_queue_type has work.
599  unsigned long GSSk, ///< Return a container of GSSk items from the front of the queue to implement the GSS(k) or bakers' scheduling algorithm.
600  template<class> class Stats
601  >
604  template<class QM>
606  QM,
607  EvSts,
608  Stats,
609  V,
610  Comp,
611  GSSk
612  > {};
613  template<class QM>
614  using pool_thread_queue_details=private_::pool_thread_queue_details<thread_pool_queue_details<QM>, GSSk>;
615  };
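  /*
   An illustration of the kind of comparator that could be supplied as Comp to the prioritised_queue above: a strict weak ordering over the queued work. This is only a sketch: by_priority and its priority() accessor are hypothetical and not part of this library; by default std::less<value_type> is used, and the user may instead define a weak ordering directly on their input work.

    template<class V>
    struct by_priority {
     constexpr bool operator()(V const &lhs, V const &rhs) const noexcept {
      return lhs.priority()<rhs.priority();	// Hypothetical accessor on the user's work type.
     }
    };

   Such a comparator would be passed as the Comp argument of prioritised_queue (and of pool_aspects below).
  */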
616 
617  /// The states in which the signalled_work_queue_type can be.
618  enum class states {
619  unsignalled,
622  };
623  }
624 
625  /// The fundamental way to specify the type of thread_pool that is required.
626  /**
627  This class is used to encapsulate the OS-specific threading traits, atomic locks, etc for the thread_pool and other dependent classes.
628  */
629  template<
 630  generic_traits::return_data RD, ///< Whether the thread_pool should implement returning data from the mutated work, i.e. support execution_contexts, i.e. model dataflow.
631  generic_traits::api_type API, ///< The API that should be used, e.g. Win32, posix_pthreads, platform_api, etc.
632  typename Mdl, ///< The threading model, e.g. heavyweight_threads, sequential_mode, etc.
 633  template<class, template<class> class, class, unsigned long, template<class> class> class PM, ///< Allows the user to specify if the thread_pool should support prioritisation of the work in its input queue, e.g. normal_fifo or prioritised_queue.
 634  template<class> class Comp=private_::no_op, ///< The optional comparison operator used to specify the partial ordering that determines the priority of the work. Note that this may restrict the types of work that may be transferred into the thread_pool. If a prioritised_queue is chosen, then by default this will be std::less<value_type>. This parameter looks complex because of the declaration of value_type, which is done in this class, so the user does not have access to that opaque type until after this class is declared. Another, easier, technique is for the user to define a weak ordering on their input work, which closure_base_t will make use of, in which case the user may ignore this parameter.
635  unsigned long GSSkSz=1,
636  template<class> class Stats=no_statistics,
637  template<class> class CFG=no_control_flow_graph
638  >
639  struct pool_aspects final {
 640  /// Whether the thread_pool should implement returning data from the mutated work, i.e. support execution_contexts.
642  /// The k-size for the batches to implement GSS(k) scheduling, if >1, then this is effectively the baker's ticket scheduling scheme.
643  /**
 644  The size of the batch to be taken in the GSS(k) or bakers' scheduling algorithm. Note that this is what I term "front_batch"ing: the tasks are extracted in batches from the signalled_work_queue_type within the thread_pool, as opposed to being batched when added to the thread_pool. A value of zero is not allowed. Note that with an average optimizing compiler there should be no performance loss for a batch-size of one, and higher batch sizes should simply result in reduced contention on the signalled_work_queue_type within the thread_pool. A template parameter is used so that the implementation can allocate a fixed-size array of tasks on the stack, thus avoiding calls to the memory manager and reducing locking; the converse would defeat the point of GSS(k) scheduling, which is to reduce lock contention!
645 
 646  If GSSk>1 and the first closure_base-derived closure depends upon a later job to complete (with a dependency that is not managed by execution_contexts, i.e. a back-edge in the control-dependency graph, i.e. not a strictly nested dependency), then that sub-tree of dependent closure_bases will deadlock. This is because the processing loop in pool_thread::process() will wait for the first closure_base to complete, which depends upon the second (or later in the batch) closure_base, which will not be executed because the earlier closure_base prevents this loop from continuing. Therefore one must ensure that for GSSk>1 the dependency tree of the closure_bases has been carefully constructed. If all is well in sequential_mode, yet it fails with GSSk>1 using platform_api, try GSSk=1: if that cures it, this is your issue. The sketch below illustrates the hazard.
647 
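  The hazard can be reproduced outside this library with plain std::future. This is only an illustrative sketch and does not use this library's API: closure A waits for closure B, but B sits behind A in the same batch, so the single thread draining the batch can never reach it.
  \code
  #include <chrono>
  #include <functional>
  #include <future>
  #include <iostream>
  #include <vector>

  int main() {
   std::promise<void> b_done;
   std::vector<std::function<void()>> batch;	// One thread's batch, i.e. GSSk==2.
   // Closure A: a back-edge dependency upon B, not managed by an execution_context.
   batch.push_back([&b_done] {
    if (b_done.get_future().wait_for(std::chrono::seconds(1))==std::future_status::timeout)
     std::cout<<"A cannot complete: B is stuck behind it in the same batch.\n";
   });
   // Closure B: would satisfy A's dependency, but is only reached once A has returned.
   batch.push_back([&b_done] { b_done.set_value(); });
   for (auto &work : batch)	// The thread processes its batch strictly in order.
    work();
  }
  \endcode
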
648  \see batch_details::process_a_batch()
649  */
650  static constexpr unsigned long GSSk=GSSkSz;
651 
652  /// The all-important os-traits: used to obtain not only the threading model_traits and generic_traits which provide the abstraction to the underlying threading implementation in the api_threading_traits, but also the api_type, and therefore the api_lock_traits which contains the atomic locks and atomic counters used. So: rather important.
653  typedef thread_os_traits<API, Mdl> os_traits;
654  typedef CFG<os_traits> cfg_type;
655 
656  template<class V> using atomic_wrapper_t=typename os_traits::lock_traits::template atomic_counter_type<V>;
657 
658  /// Some typedefs used as short-hands.
661 
662  /// Some classes used as short-hands.
663  template<generic_traits::return_data RD1, class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
664  struct thread_wk;
665  /// Some classes used as short-hands.
666  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
669  typedef typename base_t::closure_t closure_t;
673 
675  : base_t(w, std::forward<typename closure_t::argument_type>(tw), p) {
676  }
677  __stdcall ~thread_wk() noexcept(true) FORCE_INLINE {}
678  };
679  /// Some classes used as short-hands.
680  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
683  typedef typename base_t::closure_t closure_t;
687 
689  : base_t(std::forward<typename closure_t::argument_type>(tw), p) {
690  }
691 
692  private:
693  friend deleter_t;
694 
695  __stdcall ~thread_wk() noexcept(true) FORCE_INLINE {}
696  };
697  /// Some classes used as short-hands.
698  template<class ThrW, class WFlg, template<class> class Del, template<class> class AtCtr>
699  struct algo_thread_wk final : public private_::closure::algo_thread_wk<os_traits, ThrW, WFlg, Del, AtCtr, cfg_type> {
701  typedef typename base_t::closure_t closure_t;
704 
705  algo_thread_wk(work_complete_t &w, typename closure_t::argument_type &&tw, typename cfg_details_type::params const &p) FORCE_INLINE
706  : base_t(w, std::forward<typename closure_t::argument_type>(tw), p) {
707  }
708  ~algo_thread_wk() noexcept(true) FORCE_INLINE {
709  }
710  };
711  /// Some classes used as short-hands.
712  template<class ThrW, class WFlg, class SubDivAlgWk, template<class> class Del=default_delete, template<class> class AtCtr=atomic_wrapper_t>
713  struct algo_thread_wk_buffered final : public private_::closure::algo_thread_wk_buffered<os_traits, ThrW, WFlg, SubDivAlgWk, Del, AtCtr, cfg_type> {
715  typedef typename base_t::closure_t closure_t;
719 
720  algo_thread_wk_buffered(work_complete_t &w, typename closure_t::argument_type &&tw, typename algo_work_heap_type::size_type const num_objs, typename cfg_details_type::params const &p) FORCE_INLINE
721  : base_t(w, std::forward<typename closure_t::argument_type>(tw), num_objs, p) {
722  }
724  }
725  };
726 
727  private:
728  /// Some typedefs used as short-hands.
729  typedef PM<
730  thread_wk_elem_type,
731  Comp,
732  pool_traits::states,
733  GSSk,
734  Stats
735  > queue_t;
736 
737  public:
738  /// The specific signalled_work_queue_type to be used in the thread_pool.
739  /**
740  This class should combine a container with an atomic event. The event should be set when there are items in the queue and reset when the container becomes empty. This would allow threads to atomically wait upon the container for work to be added to it.
741 
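  A minimal sketch of that combination, assuming std::condition_variable plays the role of the event; the real signalled_work_queue_type uses the container, lock and event types selected by the traits above, and the name signalled_queue_sketch is invented for the example:
  \code
  #include <condition_variable>
  #include <deque>
  #include <functional>
  #include <mutex>

  class signalled_queue_sketch {
  public:
   using work_item=std::function<void()>;

   void push_back(work_item w) {
    {
     std::lock_guard<std::mutex> lk(mtx_);
     items_.push_back(std::move(w));
    }
    event_.notify_one();	// "Set" the event: there is now work available.
   }
   work_item pop_front() {
    std::unique_lock<std::mutex> lk(mtx_);
    event_.wait(lk, [this] { return !items_.empty(); });	// Atomically wait for work to be added.
    work_item w=std::move(items_.front());
    items_.pop_front();
    return w;	// The "event" is effectively reset once items_ becomes empty.
   }

  private:
   std::mutex mtx_;
   std::condition_variable event_;
   std::deque<work_item> items_;
  };
  \endcode
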
742  \todo Colin Egan suggested that one could consider the asynchronous work transferred into this queue as a set of instructions. (The ISA being generated by the program being compiled, composed of the unique closure_base-derived closure types transferred.) One could then analyse these instructions as sets of basic-blocks, and apply analysis to those basic blocks for code-hoisting, consider trace-scheduling, etc, etc.
743  */
744  template<class QM>
746  template<class QM>
748  template<class QM>
750  template<class QM>
752  template<class QM>
754  /**
 755  Note that the parameter to Stats is not atomic, implying that performance is preferred over accuracy. This is by design: locking reduces performance, and this library has been designed to be fast, so the statistics gathered are consequently less accurate; in particular they may be under-estimates. This isn't as bad as it first appears, as most SMP architectures implement some form of cache-coherency protocol (e.g. MESI or MOESI) that can correct some of these inaccuracies.
756 
 757  A consequence of this is that 'valgrind --tool=helgrind' will report potential race-conditions if, for example, basic_statistics is used. This is not a problem: for speed, basic_statistics does not add any locking, so the race-conditions are to be expected. Please ignore those warnings, or use the no_statistics class instead.
758 
759  \see no_statistics
760  \see basic_statistics
761  */
762  template<class QM>
764 
765  /// An accessor for getting at the priority mode that the thread_pool may support.
767  };
768 
769  template<
770  class DM,
772  class P
773  >
774  class thread_pool;
775 
776 } } }
777 
778 #endif