libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
thread_pool_base.hpp
Go to the documentation of this file.
1 #ifndef LIBJMMCG_CORE_PRIVATE_THREAD_POOL_BASE_HPP
2 #define LIBJMMCG_CORE_PRIVATE_THREAD_POOL_BASE_HPP
3 
4 /******************************************************************************
5 ** Copyright © 2010 by J.M.McGuiness, coder@hussar.me.uk
6 **
7 ** This library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU Lesser General Public
9 ** License as published by the Free Software Foundation; either
10 ** version 2.1 of the License, or (at your option) any later version.
11 **
12 ** This library is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ** Lesser General Public License for more details.
16 **
17 ** You should have received a copy of the GNU Lesser General Public
18 ** License along with this library; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
23 #include "../../core/thread_pool_aspects.hpp"
25 #include "../../core/thread_statistics.hpp"
26 
27 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd { namespace private_ {
28 
/// String literals naming the parallel algorithms; presumably used to label the
/// corresponding nodes in the run-time control-flow graph (cfg_type) — confirm at
/// the use-sites, which are outside this view.
static constexpr char const node_details_acc_op[]="accumulate_op";
static constexpr char const node_details_acc[]="accumulate";
static constexpr char const max_element_str[]="max_element";
static constexpr char const min_element_str[]="min_element";
33 
/// Metafunction yielding the element-type of a possibly shared_ptr-wrapped type.
/**
 Primary template: for a plain type V the result is V itself. The second
 (unnamed) parameter is the lock-type, unused here, but matched by the
 shared_ptr specialisation that follows.

 NOTE(review): the "struct remove_shared_ptr {" declaration line was missing in
 this extraction; it has been reconstructed from the specialisation below —
 confirm against the upstream thread_pool_base.hpp.
*/
template<class V, class>
struct remove_shared_ptr {
 typedef V value_type;
};
38 template<class V, class LkT>
39 struct remove_shared_ptr<shared_ptr<V, LkT>, LkT> {
40  typedef V value_type;
41 };
42 
43 /// Base class for all multi-threaded thread_pools.
44 /**
45  thread_pool_base is for a finite-sized pool of threads (possibly large). You transfer work to the pool, and may then obtain the results of that work via an execution_context class that allows you to get the result (automatically waiting if necessary for the work to be processed). The closure_base-derived closure is processed in the order in which it is transferred to the pool. When the pool goes out of scope, all threads within the pool are given a grace period to end their work, and if necessary those unfinished threads are terminated.
46  Note that the work scheduling technique in these pools is the baker's ticket scheduling algorithm. Because the thread_wk_t in the queue is owned by the queue and the execution_contexts are strictly automatic variables, the current list of work in the queue is guaranteed to be strictly independent of each other. This is an important result: it means that the thread_pool is free to re-schedule the work in any way it chooses, and the program correctness is guaranteed.
47 
48  Notes:
49  ======
50  1. Both thread_pool and thread_wk_t have various traits. These specialise the pool implementation for various architectural features. e.g. Win32 & Posix threads are very slow to create, synchronise & destroy. For other architectures this may not be the case. Appropriate specialisations may be provided.
51  2. Currently the implementation is biased towards non-cellular threading, or "heavyweight" threading. This may not be the case in the future.
52  3. More traits on the pool allow for alternative work-sharing techniques. Currently in the statically sized pool (pool_traits::work_distribution_mode_t::worker_threads_get_work) the workers steal work from the signalled_work_queue. Alternatively for a master-slave approach (pool_traits::work_distribution_mode_t::one_thread_distributes<>) in the thread_wk_t class, the work is assigned to threads via the "main" thread that performs the work assignment. Both of these techniques have their limitations, but this may not be the case in the future.
53  4. std::remove() has not been implemented because it returns an iterator to the partitioned collection. Iterators into collections have difficulties in a multi-threaded environment because the iterator shouldn't be invalidated by subsequent operations on the collection, and the order of these subsequent operations is hard to predict in a multi-threaded environment. In fact PPD eschews iterators and uses the collections directly, which is effectively a "range" instead, for those reasons.
54 
55  Example usage:
56  ==============
57  See the examples directory.
58 */
/// Forward declaration of the primary template; a definition follows below.
template<
 class DM, ///< The work distribution mode, selected from the types in the work_distribution_mode_t namespace.
 pool_traits::size_mode_t Ps, ///< The size model of the pool, selected from size_mode_t.
 typename PTT, ///< The pool_aspects that contains many more properties of the thread_pool.
 class Pt ///< Not for the user. The container for the pool_threads, used internally by the library.
>
class thread_pool_base;
66 
67 template<
68  class DM,
70  typename PTT,
71  class Pt
72 >
73 class thread_pool_base {
74 private:
 /**
 To avoid a race condition in executing the first task by a pool_thread and calling execution_context::get_results().
 */
 static constexpr unsigned short init_num_jobs_par_alg=1;
 // NOTE(review): the "other" count starts at zero — presumably the initial job count
 // outside the parallel-algorithm case; confirm at the use-sites, which are outside this view.
 static constexpr unsigned short init_num_jobs_par_alg_other=0;
 /**
 In parallel algorithms we use the full range of the input collection by default.
 */
 static constexpr unsigned short default_num_subranges=1;
84 
85 public:
86  using pool_traits_type=PTT;
87  using pool_type=Pt;
88  using pool_size_type=typename pool_type::size_type;
89  using pool_thread_type=typename remove_shared_ptr<typename pool_type::value_type, api_lock_traits<platform_api, sequential_mode>>::value_type;
90  /// A useful typedef to easily get to the various OS traits.
91  using os_traits=typename pool_traits_type::os_traits;
92  /// All exceptions thrown by this class are of this type, or derived from it.
93  /**
94  The exception class in this class is called "exception". This is, naturally, eventually derived from "std::exception".
95 
96  \see exception_type
97  */
98  using exception_type=typename os_traits::exception_type;
99  /// A useful typedef to easily get to the various OS specific thread-traits.
100  using thread_traits=typename os_traits::thread_traits;
101  /// A useful typedef to easily get to the various API details.
102  using api_params_type=typename thread_traits::api_params_type;
103  /// A useful typedef to easily get to the various priorities.
104  using priority_type=typename api_params_type::priority_type;
105  using work_distribution_mode=DM;
106  using pool_thread_queue_details=typename pool_traits_type::template pool_thread_queue_details<typename work_distribution_mode::queue_model>;
107  using statistics_type=typename pool_thread_queue_details::statistics_type;
108  using signalled_work_queue_type=typename pool_traits_type::template signalled_work_queue_type<typename work_distribution_mode::queue_model>;
109  using queue_size_type=typename signalled_work_queue_type::size_type;
110 
111  static constexpr pool_traits::size_mode_t size_mode=Ps;
112  /**
113  To assist in allowing compile-time computation of the algorithmic order of the threading model.
114  */
115  static constexpr generic_traits::memory_access_modes memory_access_mode=((pool_traits_type::template thread_pool_queue_details<typename work_distribution_mode::queue_model>::memory_access_mode==generic_traits::memory_access_modes::crew_memory_access && pool_traits_type::template pool_thread_queue_details<typename work_distribution_mode::queue_model>::memory_access_mode==generic_traits::memory_access_modes::crew_memory_access) ? generic_traits::memory_access_modes::crew_memory_access : generic_traits::memory_access_modes::erew_memory_access);
116 
117  /// The type of the control-flow graph that will be generated at run-time, if supported.
118  /**
119  \see cfg(), dummy_control_flow_graph, control_flow_graph
120  */
121  using cfg_type=typename pool_traits_type::cfg_type;
122  /// A useful typedef to easily get to the nonjoinable grammar element.
123  /**
124  \see nonjoinable_t
125  */
126  using nonjoinable=nonjoinable_t<thread_pool_base>;
127  /// A useful typedef to easily get to the joinable grammar element.
128  /**
129  \see joinable_t
130  */
131  using joinable=joinable_t<thread_pool_base>;
132  /// A useful typedef to easily get to the nonjoinable_buff grammar element.
133  /**
134  Used internally by the library.
135 
136  \see nonjoinable_buff_t
137  */
138  using nonjoinable_buff=nonjoinable_buff_t<thread_pool_base>;
 /**
 Used internally by the library.

 Tag-type parameterised on the compile-time priority value.
 NOTE(review): shown empty here, yet it is constructed from *this in the
 priority-transfer member further down — its body may have been elided in this
 extraction; confirm against the upstream source.
 */
 template<priority_type Pri> struct priority {};
143  /// This is a useful typedef to get at the execution_context. The work is allocated on the stack, inside this type.
144  /**
145  Used internally by the library.
146 
147  The execution_context is created by joinably transferring work into the pool. It has various uses, but is primarily used to atomically and synchronously wait on the results of the work on the closure_base-derived closure-derived object, as specified by the thread_wk_t object transferred into the pool. But it can also pass back specified exceptions that may be thrown by the work. It can also be used to asynchronously test if the work has been completed, and delete the work from the pool, if it has not been started.
148 
149  \see create_direct
150  \see execution_context_stack_type
151  \see joinable_t
152  \see closure_base
153  */
154  template<class InpWk>
155  struct execution_context_stack;
156  /// Used by the library to implicitly generate a closure from the InpWk type.
157  /**
158  Used internally by the library.
159  */
160  template<
161  typename InpWk, ///< The closure_base-derived closure type. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it.
162  class PtrFnType=decltype(&std::remove_reference<InpWk>::type::process) ///< The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct. Note that exception-specifications are not supported at all, including "throw()", but exception-declarations, such as noexcept are supported.
163  >
164  struct create_direct;
165 
166 private:
167  /**
168  Used internally be the library.
169  */
170  typedef stl_functor_result_type<bool> boolean_result_type;
171 
172  /// A modifier to allow joinably transferring the work to the pool.
173  /**
174  Used internally be the library.
175 
176  \see for_each(), for_each_work_type, for_each_reduce, subdivide_n_gen_wk1
177  */
178  template<
179  class Colln, ///< The adapted collection to iterate over.
180  class Fn
181  >
182  class for_each_t;
183 
184  /// A modifier to allow joinably transferring the work to the pool.
185  /**
186  Used internally be the library.
187 
188  \see count_if(), counter, count_if_reduce, subdivide_n_gen_wk1
189  */
190  template<
191  class Colln, ///< The adapted collection to search.
192  typename Pred ///< The predicate to be used to find the value to be counted.
193  >
194  class count_if_t;
195 
196  /// A modifier to allow joinably transferring the work to the pool.
197  /**
198  Used internally be the library.
199 
200  \see count(), counter, count_if_reduce, subdivide_n_gen_wk1
201  */
202  template<
203  class Colln ///< The adapted collection to search.
204  >
205  struct count_t;
206 
207  /// A modifier to allow joinably transferring the work to the pool.
208  /**
209  Used internally be the library.
210 
211  \see find_if(), countor_work_type, find_if_reduce, subdivide_n_gen_wk1
212  */
213  template<
214  class Colln, ///< The adapted collection to search.
215  typename Pred ///< The predicate to be used to find the value to be counted.
216  >
217  class find_if_t;
218 
219  /// A modifier to allow joinably transferring the work to the pool.
220  /**
221  Used internally be the library.
222 
223  \see find(), countor_work_type, find_if_reduce, subdivide_n_gen_wk1
224  */
225  template<
226  class Colln ///< The adapted collection to search.
227  >
228  struct find_t;
229 
230  /// A modifier to allow joinably transferring the work to the pool.
231  /**
232  Used internally be the library.
233 
234  \see transform(), for_each_work_type, subdivide_n_gen_wk2
235  */
236  template<
237  typename CollnIn, ///< The adapted collection to transform.
238  typename CollnOut, ///< The adapted collection to output into.
239  typename UniOp
240  >
241  class transform_t;
242 
243  /// A modifier to allow joinably transferring the work to the pool.
244  /**
245  Used internally be the library.
246 
247  \see transform(), transform_t
248  */
249  template<
250  typename CollnIn, ///< The adapted collection to transform.
251  typename CollnOut, ///< The adapted collection to output into.
252  class IterIn, ///< The iterator type to be used to iterate over the collections.
253  typename UniOp ///< The unary operator to apply to the input & generate the output.
254  >
255  class transform_iter_t;
256 
257  /// A modifier to allow joinably transferring the work to the pool.
258  /**
259  Used internally be the library.
260 
261  \see transform(), for_each_work_type, transform2_reduce, subdivide_n_gen_wk3
262  */
263  template<
264  typename CollnIn1, ///< The adapted collection to transform.
265  typename CollnIn2, ///< The adapted collection to transform.
266  typename CollnOut, ///< The adapted collection to output into.
267  typename BinOp ///< The binary operator to apply to the input & generate the output.
268  >
269  class transform2_t;
270 
271  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
272  /**
273  Used internally be the library.
274  */
275  template<class Res>
276  struct map_red_initialiser;
277  /// A modifier to allow joinably transferring the work to the pool.
278  /**
279  Used internally be the library.
280 
281  \see accumulator_work_type, subdivide_n_gen_wk1
282  */
283  template<
284  class Colln, ///< The adapted collection to search.
285  typename BinOp, ///< The binary functor to be used to accumulate the result.
286  class V, ///< The result-type to be returned by the initialiser, to avoid problems with explicit conversions being required.
287  template<class, class> class Reduce, ///< The reduction operation to perform.
288  class Init ///< The initialiser to be used to initialise the result value.
289  >
290  class map_reduce_t;
291 
292  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
293  /**
294  Used internally be the library.
295 
296  \see max_element_t
297  */
298  template<
299  class Colln ///< The adapted collection to searched.
300  >
301  struct max_element_initialiser;
302  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
303  /**
304  Used internally be the library.
305 
306  \see min_element_t
307  */
308  template<
309  class Colln ///< The adapted collection to searched.
310  >
311  struct min_element_initialiser;
312 
313  /// A modifier to allow joinably transferring the work to the pool.
314  /**
315  Used internally be the library.
316 
317  \see accumulate(), accumulate_processor, accumulate_reduce
318  */
319  template<
320  class Colln, ///< The adapted collection to searched.
321  class BinOp ///< The binary functor to be used to accumulate the result.
322  >
323  struct accumulate_op_processor;
324 
325  /// A modifier to allow joinably transferring the work to the pool.
326  /**
327  Used internally be the library.
328 
329  \see accumulate(), accumulate_op_processor, accumulate_reduce
330  */
331  template<
332  class Colln, ///< The adapted collection to search.
333  class V ///< The result-type to be returned by the initialiser, to avoid problems with explicit conversions being required.
334  >
335  struct accumulate_processor;
336 
337  /// A modifier to allow joinably transferring the work to the pool.
338  /**
339  Used internally be the library.
340 
341  \see map_reduce_t, max_element_initialiser, alg_wk_wrap::max_element_reduce
342  */
343  template<
344  class Colln, ///< The adapted collection to search.
345  class Comp=std::less<typename Colln::value_type> ///< The comparator to use to compare the items.
346  >
347  struct max_element_t;
348  /// A modifier to allow joinably transferring the work to the pool.
349  /**
350  Used internally be the library.
351 
352  \see map_reduce_t, min_element_initialiser, alg_wk_wrap::min_element_reduce
353  */
354  template<
355  class Colln, ///< The adapted collection to search.
356  class Comp=std::less<typename Colln::value_type> ///< The comparator to use to compare the items.
357  >
358  struct min_element_t;
359 
360  /// A modifier to allow joinably transferring the work to the pool.
361  /**
362  Used internally be the library.
363 
364  \see fill_n(), pass_value, fill_n_reduce, subdivide_n_gen_wk1
365  */
366  template<
367  class Colln ///< The adapted collection to fill.
368  >
369  class fill_n_t;
370 
371  /// A modifier to allow joinably transferring the work to the pool.
372  /**
373  Used internally be the library.
374 
375  \see fill(), pass_value, fill_reduce, subdivide_n_gen_wk1
376  */
377  template<
378  class Colln ///< The adapted collection to fill.
379  >
380  class fill_t;
381 
382  /// A modifier to allow joinably transferring the work to the pool.
383  /**
384  Used internally be the library.
385 
386  \see fill(), pass_value, fill_reduce, subdivide_n_gen_wk1
387  */
388  template<
389  class Colln ///< The collection to reverse.
390  >
391  class reverse_t;
392 
393  /// A modifier to allow joinably transferring the work to the pool.
394  /**
395  Used internally be the library.
396 
397  \see merge(), subdivide_n_gen_wk2
398  */
399  template<
400  typename CollnIn1, ///< The adapted collection to merge.
401  typename CollnIn2, ///< The adapted collection to merge.
402  typename CollnOut, ///< The adapted collection to output into.
403  typename Compare ///< The binary functor to be used as the comparator to order the items in the output.
404  >
405  class merge_t;
406 
407  /// A modifier to allow joinably transferring the work to the pool.
408  /**
409  Used internally be the library.
410 
411  \see sort(), subdivide_n_gen_wk1
412  */
413  template<
414  class Colln, ///< The adapted collection to sort.
415  typename Compare ///< The binary functor to be used as the comparator to order the items in the output.
416  >
417  class sort_t;
418 
419 public:
420  /// A modifier to allow joinably transferring the work to the pool.
421  /**
422  Used internally be the library.
423 
424  \see transform(), transform_t
425  transform_iter_t<CollnIn, CollnOut, IterIn, noop<typename CollnOut::value_type>
426  */
427  template<
428  typename CollnIn, ///< The adapted collection to be copied.
429  typename CollnOut, ///< The adapted collection to output into.
430  class IterIn ///< The iterator type to be used to iterate over the collections.
431  >
432  struct copy_iter_t;
433 
434  /// A modifier to allow joinably transferring the work to the pool.
435  /**
436  Used internally be the library.
437 
438  \see swap_ranges(), for_each_work_type, swap_ranges_reduce, subdivide_n_gen_wk1
439  */
440  template<
441  class Colln, ///< The adapted collection to swap the elements.
442  class Pred ///< The binary functor to be used as the comparator to order the items in the output.
443  >
444  class swap_ranges_t;
445 
446  const pool_size_type max_num_threads_in_pool;
447 
 thread_pool_base(thread_pool_base const &)=delete; // Non-copyable: the pool owns its threads.
 /**
 Note that it is recommended that the thread_pool is created on the stack, within main(). Instantiating a thread_pool outside main(), for example as a singleton, is possible and should work, but orderly destruction of the thread_pool is not guaranteed outside main(), and is likely to cause an abort() or core to be dumped. If a singleton outside main() must be used it is recommended to allocate the thread_pool on the heap, and leak it at exit, hoping the OS will clean up all resources. Currently, the thread_pool does not use any inter-process shared memory or global named objects (but this is not guaranteed in the future), so in principle the OS should be able to recover all that would be leaked.
 */
 // noexcept(false): per the algorithm documentation below, exceptions from transferred
 // work may propagate to the dtor of the thread_pool, so destruction is deliberately
 // permitted to throw.
 virtual __stdcall ~thread_pool_base() noexcept(false) FORCE_INLINE {
 }
454 
455  /// Returns true if there are no threads in the thread_pool.
456  /**
457  \return true if there are no threads in the thread_pool.
458  */
459  virtual bool __fastcall pool_empty() const noexcept(true)=0;
460  /// Returns the current number of threads in the thread_pool.
461  /**
462  \return The current number of threads in the thread_pool.
463  */
464  virtual const pool_size_type __fastcall pool_size() const noexcept(true)=0;
465  /// Returns true if there is no work to process by the thread_pool.
466  /**
467  \return true if there is no work to process by the thread_pool.
468  */
469  virtual bool __fastcall queue_empty() const noexcept(true)=0;
470  /// Returns the current amount of outstanding, unscheduled work items to be processed by the thread_pool.
471  /**
472  \return The current amount of outstanding, unscheduled work items to be processed by the thread_pool.
473  */
474  virtual const queue_size_type __fastcall queue_size() const noexcept(true)=0;
475 
476  /// Remove all of the outstanding tasks from the thread pool.
477  /**
478  Note that this may cause deadlocks if the tasks are inter-dependent.
479  */
480  virtual void __fastcall queue_clear() noexcept(false)=0;
481 
482  /// Obtain access to any statistics data collected by the operation of the thread_pool.
483  virtual statistics_type const __fastcall statistics() const =0;
484 
485  /// Return the theoretical minimum time in computations according to section 3.3 & Theorem 3.3 in [1] required to complete the current work with the current number of threads in the pool using a CREW-PRAM and according to section 1.3.2, Theorem 1.2 in [2] for an EREW-PRAM.
486  /**
487  This allows the user to determine the current computational efficiency of their thread_pool with the supplied thread-safe adapted container, safe_colln, as they can use this to profile their code and adjust the size of the thread_pool for the target architecture.
488 
489  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
490  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
491 
492  \return The minimum number of computations
493 
494  \todo It would be nice if there was some result for returning this with respect to the memory access models of the work within the queue (which may be a mix of CREW & EREW memory models) for the current thread_pool.
495 
496  \see safe_colln
497  */
498  virtual unsigned long __fastcall
499  min_time(generic_traits::memory_access_modes mode) const noexcept(true)=0;
500 
501  /// Return the theoretical minimum number of processors required to achieve the minimum computation time according to section 3.3 & Theorem 3.3 in [1] required to complete the current work using a CREW-PRAM.
502  /**
503  This allows the user to determine the current computational efficiency of their thread_pool with the supplied thread-safe adapted container, safe_colln, as they can use this to profile their code and adjust the size of the thread_pool for the target architecture.
504 
505  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
506 
507  \return The minimum number of processors
508 
509  \todo It would be nice if there was some result for returning this with respect to the memory access models of the work within the queue (which may be a mix of CREW & EREW memory models) for the current thread_pool.
510 
511  \see safe_colln
512  */
513  virtual unsigned long __fastcall
514  min_processors(generic_traits::memory_access_modes mode) const noexcept(true)=0;
515 
 /// Transfer the priority to the pool, non-joinably.
 /**
 Verify that the closure_base-derived closure has not been previously transferred, if it has, throw an exception_type.

 \return A reference to the pool to allow chaining.

 \see priority, set_priority
 */
 template<priority_type Pri>
 priority<Pri> __fastcall FORCE_INLINE
 // NOTE(review): the declarator line (the member's name and parameter list) is missing
 // between the return-type line above and the body below — almost certainly lost in
 // extraction; restore it from the upstream thread_pool_base.hpp.
 return priority<Pri>(*this);
 }
529 
530  /// Non-joinably transfer the closure_base-derived closure into the thread_pool.
531  /**
532  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
533 
534  \see nonjoinable, joinable
535  */
536  nonjoinable __fastcall FORCE_INLINE
537  operator<<(nonjoinable &&nj) noexcept(true) {
538  return nonjoinable(std::forward<nonjoinable>(nj), *this);
539  }
540 
541  /// Non-joinably transfer the closure_base-derived closure into the thread_pool.
542  /**
543  \param njb Initialised with a suitable buffer for allocating the internal work items into.
544 
545  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
546 
547  \see nonjoinable_buff, algo_thread_wk_buffered
548  */
549  nonjoinable_buff __fastcall FORCE_INLINE
550  operator<<(nonjoinable_buff &&njb) noexcept(true) {
551  return nonjoinable_buff(std::forward<nonjoinable_buff>(njb), *this);
552  }
553 
554  /// Joinably transfer the closure_base-derived closure into the thread_pool.
555  /**
556  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
557 
558  \see joinable_t, nonjoinable_t
559  */
560  joinable_t<thread_pool_base> __fastcall FORCE_INLINE
561  operator<<(joinable_t<thread_pool_base> &&j) noexcept(true) {
562  return joinable_t<thread_pool_base>(std::forward<joinable_t<thread_pool_base>>(j), *this);
563  }
564 
565  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
566  /**
567  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a copy of the std::unary_function f is applied. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of f, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the
568 execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
569 
570  Also this algorithm will be faster than a single-threaded algorithm if the cost of f(Colln::value_type) is more expensive than the threading costs in the pool.
571 
572  This algorithm makes at most 1 memory allocation.
573 
574  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
575  \param fn A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If fn is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
576  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
577 
578  \see std::for_each()
579  \see subdivide_n_gen_wk1
580  \see for_each_t
581  \see execution_context
582  */
583  template<
584  class Colln,
585  typename Fn
586  > parallel_algorithm<for_each_t<Colln, Fn> > __fastcall FORCE_INLINE
587  for_each(Colln const &c, Fn const &fn);
588 
589  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
590  /**
591  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a p(Colln::value_type) is performed and if the result is true, an atomic_counter_type is incremented. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of p, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is
592 nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
593 
594  Also this algorithm will be faster than a single-threaded algorithm if the cost of p(Colln::value_type) is more expensive than the threading costs in the pool.
595 
596  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
597  \param p The predicate to use to count the matching values.
598  \return An execution_context that may be waited upon to determine when all of the operation is complete, and obtain the count.
599 
600  \see std::count_if()
601  \see subdivide_n_gen_wk1
602  \see count_if_t
603  \see safe_colln
604  \see execution_context
605  */
606  template<
607  class Colln,
608  class Pred
609  > parallel_algorithm<count_if_t<Colln, Pred> > __fastcall FORCE_INLINE
610  count_if(Colln const &c, Pred const &p);
611 
612  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
613  /**
614  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
615  \param v The value to find and be counted.
616  \return An execution_context that may be waited upon, and obtain the count.
617 
618  \see count_if()
619  \see execution_context
620  */
621  template<
622  class Colln
623  > parallel_algorithm<count_t<Colln> > __fastcall FORCE_INLINE
624  count(Colln const &c, typename Colln::value_type const &v);
625 
626  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
627  /**
628  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a p(Colln::value_type) is performed and if the result is true, the element is found. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of p, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the
629 client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
630 
631  Also this algorithm will be faster than a single-threaded algorithm if the cost of p(Colln::value_type) is more expensive than the threading costs in the pool.
632 
633  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. The collection should contain unique elements.
634  \param p The predicate to use to find any equivalent value.
635  \return An execution_context that may be waited upon to determine when all of the operation is complete, and obtain a boolean that indicates if there exists an element in the collection that is the same as the predicate.
636 
637  \see std::find_if()
638  \see subdivide_n_gen_wk1
639  \see find_if_t
640  \see safe_colln
641  \see execution_context
642  */
643  template<
644  class Colln,
645  class Pred
646  > parallel_algorithm<find_if_t<Colln, Pred> > __fastcall FORCE_INLINE
647  find_if(Colln const &c, Pred const &p);
648 
649  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
650  /**
651  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
652  \param v The value to find.
653  \return An execution_context that may be waited upon, and obtain a boolean that indicates if the value was found.
654 
655  \see find_if()
656  \see execution_context
657  */
658  template<
659  class Colln
660  > parallel_algorithm<find_t<Colln> > __fastcall FORCE_INLINE
661  find(Colln const &c, typename Colln::value_type const &v);
662 
663  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
664  /**
665  A read_lock_type is taken in the input collection and a write_lock_type on the output collection. The output collection is resize()d to be big enough to take all elements from the input collection, then the write_lock_type is decayed to a read_lock_type, atomically. Then for each iterator in the range [in.begin(), in.end()) a op(CollnIn::value_type) is assigned to each element in the output collection. The application, input iterator and output iterator tuples are enqueued in the thread_pool for eventual application. The read_lock_types on all the collections are released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an
666 exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
667 
668  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
669 
670  This algorithm makes at most 1 memory allocation.
671 
672  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
673  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, and optionally a decaying_write lock.
674  \param uniop The unary operator to apply to each element of in; the output is placed into out. The collection should support random-access iterators and optionally a decaying_write lock. A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If uniop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
675  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
676 
677  \see std::transform()
678  \see subdivide_n_gen_wk1
679  \see transform_t
680  \see safe_colln
681  \see execution_context
682  */
683  template<
684  typename CollnIn,
685  typename CollnOut,
686  class UniOp
687  > parallel_algorithm<transform_t<CollnIn, CollnOut, UniOp> > __fastcall FORCE_INLINE
688  transform(CollnIn const &in, CollnOut &out, UniOp const &uniop);
689 
690  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
691  /**
692  A read_lock_type is taken in the input collections and a write_lock_type on the output collection. The output collection is resize()d to be big enough to take all elements from the input collection, then the write_lock_type is decay()ed to a read_lock_type, atomically. Then for each iterator in the range [in1.begin(), in1.end()) a op(CollnIn1::value_type, CollnIn2::value_type) is assigned to each element in the output collection. The application, input iterators and output iterator tuples are enqueued in the thread_pool for eventual application. The read_lock_types on all the collections are released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an
693 application causes an exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
694 
695  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn1::value_type,CollnIn2::value_type)) is more expensive than the threading costs in the pool.
696 
697  This algorithm makes at most 1 memory allocation.
698 
699  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
700  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
701  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, and optionally a decaying_write lock.
702  \param binop The binary operator to apply to each pair of elements of in1 and in2; the output is placed into out. A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If binop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
703  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
704 
705  \see std::transform()
706  \see subdivide_n_gen_wk1
707  \see transform2_t
708  \see safe_colln
709  \see execution_context
710  */
711  template<
712  typename CollnIn1,
713  typename CollnIn2,
714  typename CollnOut,
715  class BinOp
716  > parallel_algorithm<transform2_t<CollnIn1, CollnIn2, CollnOut, BinOp> > __fastcall FORCE_INLINE
717  transform(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out, BinOp const &binop);
718 
719  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
720  /**
721  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
722 
723  This algorithm makes at most 1 memory allocation.
724 
725  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
726  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and optionally a decaying_write lock.
727  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
728 
729  \see std::copy()
730  \see transform()
731  */
732  template<
733  typename CollnIn,
734  typename CollnOut
735  > parallel_algorithm<transform_t<CollnIn, CollnOut, noop<typename CollnOut::value_type> > > __fastcall FORCE_INLINE
736  copy(CollnIn const &in, CollnOut &out);
737 
738  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
739  /**
740  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a op(Colln::value_type, v) is performed and if the result is true, an atomic_counter_type is incremented. The application and iterator pair are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is
741 nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
742 
743  Also this algorithm will be faster than a single-threaded algorithm if the cost of op(Colln::value_type, v) is more expensive than the threading costs in the pool.
744 
745  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
746  \param v The initial value, note that this must be an identity for the binary operation binop, otherwise the result will be an undefined value.
747  \param binop The binary operation to use.
748  \return An execution_context that may be waited upon to determine when all of the applications of op are complete, and obtain the result.
749 
750  \see std::accumulate()
751  */
752  template<
753  class Colln,
754  typename BinOp
755  > parallel_algorithm<accumulate_op_processor<Colln, BinOp> > __fastcall FORCE_INLINE
756  accumulate(Colln const &c, typename BinOp::result_type const &v, BinOp const &binop);
757 
758  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
759  /**
760  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
761  \param v The initial value, note that this must be an identity for the binary operator+(), otherwise the result will be an undefined value.
762  \return An execution_context that may be waited upon to determine when all of the applications of op are complete, and obtain the result.
763 
764  \see accumulate()
765  */
766  template<
767  class Colln,
768  class V
769  > parallel_algorithm<accumulate_processor<Colln, V>> __fastcall FORCE_INLINE
770  accumulate(Colln const &c, V const &v);
771 
772  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
773  /**
774  A write_lock_type is taken, the collection c is resized, the write_lock_type is decayed() to a read_lock_type, then for each iterator, iter, in the range [c.begin(), c.end()) the value v is assigned in some unspecified order. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the assignments will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor
775 of the execution_context. If the thread_pool is nonjoinable(), or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
776 
777  Also this algorithm will be faster than a single-threaded algorithm if the cost of *iter=Colln::value_type(v) is more expensive than the threading costs in the pool.
778 
779  This algorithm makes at most 1 memory allocation.
780 
781  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators.
782  \param sz The number of items to place into the collection.
783  \param v The value used to copy into the collection.
784  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
785 
786  \see std::fill_n()
787  \see subdivide_n_gen_wk1
788  \see fill_n_t
789  \see execution_context
790  */
791  template<
792  class Colln
793  > parallel_algorithm<fill_n_t<Colln> > __fastcall FORCE_INLINE
794  fill_n(Colln &c, typename Colln::size_type sz, typename Colln::value_type const &v);
795 
796  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
797  /**
798  A read_lock_type is taken then for each iterator, iter, in the range [c.begin(), c.end()) the value v is assigned in some unspecified order. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the assignments will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable(), or the client omits
799 to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
800 
801  Also this algorithm will be faster than a single-threaded algorithm if the cost of *iter=Colln::value_type(v) is more expensive than the threading costs in the pool.
802 
803  This algorithm makes at most 1 memory allocation.
804 
805  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators.
806  \param v The value used to copy into the collection.
807  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
808 
809  \see std::fill()
810  \see subdivide_n_gen_wk1
811  \see fill_t
812  \see execution_context
813  */
814  template<
815  class Colln
816  > parallel_algorithm<fill_t<Colln> > __fastcall FORCE_INLINE
817  fill(Colln &c, typename Colln::value_type const &v);
818 
819  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
820  /**
821  A read_lock_type is taken then the range is reversed, the iterator pairs that are swapped are done in some unspecified order. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the swaps will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all swaps are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable(), or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is
822 unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
823 
824  Also this algorithm will be faster than a single-threaded algorithm if the cost of the iter_swaps are more expensive than the threading costs in the pool.
825 
826  This algorithm makes at most 1 memory allocation.
827 
828  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support bidirectional-iterators.
829  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
830 
831  \see std::reverse()
832  \see subdivide_n_gen_wk1
833  \see reverse_t
834  \see execution_context
835  */
836  template<
837  typename Colln
838  > parallel_algorithm<reverse_t<Colln> > __fastcall FORCE_INLINE
839  reverse(Colln &c);
840 
841  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
842  /**
843  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
844  \param comp The comparator to use to compare the items.
845  \return An execution_context that may be waited upon to obtain the result, which is the largest value in the collection, not an iterator to it.
846 
847  \see std::max_element()
848  \see max_element_t
849  \see execution_context
850  */
851  template<
852  class Colln,
853  class Comp
854  > parallel_algorithm<max_element_t<Colln, Comp>> __fastcall FORCE_INLINE
855  max_element(Colln const &c, Comp const &comp);
856 
857  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
858  /**
859  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
860  \return An execution_context that may be waited upon to obtain the result, which is the largest value in the collection, not an iterator to it.
861 
862  \see std::max_element()
863  \see max_element_t
864  \see execution_context
865  */
866  template<
867  typename Colln
868  > parallel_algorithm<max_element_t<Colln, std::less<typename Colln::value_type>>> __fastcall FORCE_INLINE
869  max_element(Colln const &c);
870 
871  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
872  /**
873  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
874  \param comp The comparator to use to compare the items.
875  \return An execution_context that may be waited upon to obtain the result, which is the smallest value in the collection, not an iterator to it.
876 
877  \see std::min_element()
878  \see min_element_t
879  \see execution_context
880  */
881  template<
882  class Colln,
883  class Comp
884  > parallel_algorithm<min_element_t<Colln, Comp>> __fastcall FORCE_INLINE
885  min_element(Colln const &c, Comp const &comp);
886 
887  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
888  /**
889  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
890  \return An execution_context that may be waited upon to obtain the result, which is the smallest value in the collection, not an iterator to it.
891 
892  \see std::min_element()
893  \see min_element_t
894  \see execution_context
895  */
896  template<
897  typename Colln
898  > parallel_algorithm<min_element_t<Colln, std::less<typename Colln::value_type>>> __fastcall FORCE_INLINE
899  min_element(Colln const &c);
900 
901  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
902  /**
903  Complexity: O(log^2(n.log(n)/p+log(p))) on average, if std::stable_sort() is used as the underlying sort algorithm, with enough memory.
904 
905  This implements Batcher's Bitonic Merge rather than Cole's parallel merge in section 5.1 pg 184 in [1] or in section 2.2.2 in [2] because of the results from [3].
906 
907  Take out a temporary read lock on the input collection and a write lock on the output collection to resize the output collection, so that it has the right number of iterators. Then take out read-locks on both, so that the write lock on the output collection is dropped, as we are now no longer modifying the actual collection, but the contents of each iterator.
908 
909  This algorithm makes at most O(min(log(p), log(n))) memory allocations.
910 
911  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
912  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
913  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, optionally a decaying_write lock.
914  \param comp A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If comp is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
915  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
916 
917  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
918  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
919  [3] Natvig, L., "Logarithmic time cost optimal parallel sorting is not yet fast in practice!", ACM/IEEE, 1990.
920 
921  \see std::merge, sort(), merge_t
922  */
923  template<
924  typename CollnIn1,
925  typename CollnIn2,
926  typename CollnOut,
927  class Compare
928  > parallel_algorithm<merge_t<CollnIn1, CollnIn2, CollnOut, Compare> > __fastcall FORCE_INLINE
929  merge(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out, Compare const &comp);
930 
931  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
932  /**
933  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
934  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
935  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and optionally a decaying_write lock.
936  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
937 
938  \see merge()
939  */
940  template<
941  typename CollnIn1,
942  typename CollnIn2,
943  typename CollnOut
944  > parallel_algorithm<merge_t<CollnIn1, CollnIn2, CollnOut, std::less<typename CollnOut::value_type> > > __fastcall FORCE_INLINE
945  merge(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out);
946 
947  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
948  /**
949  Complexity: O(log^2(n.log(n)/p+log(p))) on average, if std::stable_sort() is used as the underlying sort algorithm, with enough memory.
950 
951  This implements Batcher's Bitonic Merge rather than Cole's parallel merge in section 5.1 pg 184 in [1] or in section 2.2.2 in [2] because of the results from [3].
952 
953  Take out a temporary read lock on the input collection and a write lock on the output collection to resize the output collection, so that it has the right number of iterators. Then take out read-locks on both, so that the write lock on the output collection is dropped, as we are now no longer modifying the actual collection, but the contents of each iterator.
954 
955  This algorithm makes at most O(min(log(p), log(n))^2) memory allocations.
956 
957  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
958  \param comp A comparison function. A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If comp is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
959  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
960 
961  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
962  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
963  [3] Natvig, L., "Logarithmic time cost optimal parallel sorting is not yet fast in practice!", ACM/IEEE, 1990.
964 
965  \see std::sort()
966  \see merge()
967  \see sort_t
968  */
969  template<
970  class Colln,
971  typename Compare
972  > parallel_algorithm<sort_t<Colln, Compare> > __fastcall FORCE_INLINE
973  sort(Colln &c, Compare const &comp);
974 
975  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
976  /**
977  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
978  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
979 
980  \see sort()
981  */
982  template<
983  typename Colln
984  > parallel_algorithm<sort_t<Colln, std::less<typename Colln::value_type> > > __fastcall FORCE_INLINE
985  sort(Colln &in);
986 
987  /// An adaptor to allow STL unary functions to be operated upon in the thread_pool.
988  /**
989  Note that the input is evaluated by transferring it into the pool, and the execution_context that holds the result has an automatic conversion to the result_type. Also note that using thread_wk_ts directly would be faster than using this, but this allows for pipelining of evaluation of the input argument then evaluation of the UniFn.
990 
991  \todo JMG: If there are no side-effects, one might implement compile-time reduction of boolean expressions implemented using logical_and() & logical_or(), see Kennedy & Allen, "Advanced Optimizing Compilers".
992 
993  \param a The parameter, that should have a mutator, process(), that has a result_type of boolean.
994  \param op The unary operation to apply to the input parameter; this also allows for the potential of nesting unary_funs to form a control-flow tree.
995  \return An execution_context that may be waited upon to determine when the mutation is complete, and the result is available.
996 
997  \see thread_wk_t
998  \see closure_base
999  \see std::unary_function
1000  */
1001  template<
1002  class ArgT,
1003  class UniFn
1004  >
1005  execution_context_stack<unary_fun_work_type<ArgT, UniFn, thread_pool_base>> __fastcall FORCE_INLINE
1006  unary_fun(ArgT &&a, UniFn const &op=UniFn());
1007 
1008  /// An adaptor to allow STL binary functions to be operated upon in the thread_pool. Note that the inputs are evaluated by transferring them into the pool, and the execution_context that holds the result has an automatic conversion to the result_type.
1009  /**
1010  Please note that the order of completion of the arguments is undefined. Technically, the left-hand argument is enqueued then the right. The execution_context that holds the result is only released when both arguments have been evaluated. This guarantees that any exceptions thrown in the course of evaluating the arguments will be correctly rethrown by the execution_context. Also note that this functionality allows for pipelining of evaluation of the input arguments then evaluation of the BinFn.
1011 
1012  \todo JMG: If there are no side-effects, one might implement compile-time reduction of boolean expressions implemented using logical_and() & logical_or(), see Kennedy & Allen, "Advanced Optimizing Compilers".
1013 
1014  \param lhs A restricted thread_wk_t, one in which the external state is only read-only, that should have a mutator, process(), that has a result_type of boolean.
1015  \param rhs A restricted thread_wk_t, one in which the external state is only read-only, that should have a mutator, process(), that has a result_type of boolean.
1016  \param op The binary operation to apply to the input parameters, this also allows for potential of nesting binary_funs to form a control-flow tree.
1017  \return An execution_context that may be waited upon to determine when both mutations are complete, and the result is available.
1018 
1019  \see thread_wk_t
1020  \see closure_base
1021  \see std::binary_function
1022  */
1023  template<
1024  class LHSArg,
1025  class RHSArg,
1026  class BinFn
1027  >
1028  execution_context_stack<binary_fun_work_type<LHSArg, RHSArg, BinFn, thread_pool_base>> __fastcall FORCE_INLINE
1029  binary_fun(LHSArg &&lhs, RHSArg &&rhs, BinFn const &op=BinFn());
1030 
1031  /// A parallel implementation of the STL logical operation of the same name.
1032  /**
1033  Note that unlike the standard, this implementation does not short-circuit the operation, therefore side-effects in the second argument may occur in situations in which they would not have using the standard implementation. The order of evaluation of the arguments is undefined.
1034 
1035  \param lhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1036  \param rhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1037  \return An execution_context that may be waited upon to determine when both mutations are complete, and the result is available. As the inputs are pure, they have no side-effects, so the only way to detect any effect is via the execution_context, so this is by default joinable_t.
1038 
1039  \see binary_fun()
1040  \see thread_wk_t
1041  \see closure_base
1042  \see std::logical_and()
1043  */
1044  template<
1045  class T
1046  >
1047  execution_context_stack<binary_fun_work_type<T const, T const, std::logical_and<bool>, thread_pool_base>> __fastcall FORCE_INLINE
1048  logical_and(T &&lhs, T &&rhs);
1049 
1050  /// A parallel implementation of the STL logical operation of the same name.
1051  /**
1052  Note that unlike the standard, this implementation does not short-circuit the operation, therefore side-effects in the second argument may occur in situations in which they would not have using the standard implementation. The order of evaluation of the arguments is undefined.
1053 
1054  \param lhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1055  \param rhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1056  \return An execution_context that may be waited upon to determine when the two mutations are complete, and the result is available. As the inputs are pure, they have no side-effects, so the only way to detect any effect is via the execution_context, so this is by default joinable_t.
1057 
1058  \see binary_fun()
1059  \see thread_wk_t
1060  \see closure_base
1061  \see std::logical_or()
1062  */
1063  template<
1064  class T
1065  >
1066  execution_context_stack<binary_fun_work_type<T const, T const, std::logical_or<bool>, thread_pool_base>> __fastcall FORCE_INLINE
1067  logical_or(T &&lhs, T &&rhs);
1068 
1069  /**
1070  Insert a human-readable representation of the state of the given thread_pool into the specified stream. \todo Implement using the advice given in "Standard C++ IOStreams and Locales" by A.Langer & K.Kreft, page 170.
1071  */
1072  template<
1073  class DM1,
1074  pool_traits::size_mode_t Ps1,
1075  typename PTT1,
1076  class Pt1
1077  >
1078  friend tostream &__fastcall
1079  operator<<(tostream &os, thread_pool_base<DM1, Ps1, PTT1, Pt1> const &t);
1080 
1081  /// Mutable access to the control-flow graph, if such support is enabled.
1082  cfg_type & cfg() noexcept(true) FORCE_INLINE {
1083  return cfg_;
1084  }
1085  /// Read-only access to the control-flow graph, if such support is enabled.
1086  cfg_type const & cfg() const noexcept(true) FORCE_INLINE {
1087  return cfg_;
1088  }
1089 
1090  /// An adaptor for the parallel equivalent of the STL algorithm of the same name. Calls std::iter_swap(i, j) for the (potentially overlapping) ranges i in [b1, e1) & j in [b2, b2+(e1-b1)) for each iterator pair (i, j) where p(*i, *j)==true.
1091  /**
1092  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1093 
1094  Also this algorithm will be faster than a single-threaded algorithm if the cost of evaluating "if (p(iterator::value_type const &, iterator::value_type const &)) std::iter_swap()" is more expensive than the threading costs in the pool.
1095 
1096  This algorithm makes at most 1 memory allocation.
1097 
1098  \param b1 The input-iterator to the start of the first range.
1099  \param e1 The input-iterator to the end of the first range.
1100  \param b2 The input-iterator to the start of the second range.
1101  \param p A binary-function predicate that returns bool and takes two arguments of type iterator::value_type. A std::binary_function predicate-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::swap_ranges(). If p is re-entrant and is thread-safe, then, given the particular nature of that predicate, the library may be able to maintain its thread-safety guarantees.
1102  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of p are complete.
1103 
1104  \see std::swap_ranges()
1105  \see subdivide_n_gen_wk1
1106  \see swap_ranges_t
1107  \see execution_context
1108  */
1109  template<
1110  class Colln,
1111  typename Pred
1112  > parallel_algorithm<swap_ranges_t<Colln, Pred> > __fastcall FORCE_INLINE
1113  swap_ranges(typename Colln::container_type::iterator b1, typename Colln::container_type::iterator e1, typename Colln::container_type::iterator b2, Pred const &p);
1114 
1115  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
1116  /**
1117  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1118 
1119  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
1120 
1121  This algorithm makes at most 1 memory allocation.
1122 
1123  \param b1 The input-iterator to the start of the first range.
1124  \param e1 The input-iterator to the end of the first range.
1125  \param b2 The output-iterator to the start of the output range.
1126  \param uniop A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If uniop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
1127  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of uniop are complete.
1128 
1129  \see copy(), transform()
1130  \see transform_t
1131  */
1132  template<
1133  typename CollnIn,
1134  typename CollnOut,
1135  class IterIn,
1136  class UniOp
1137  > parallel_algorithm<transform_iter_t<CollnIn, CollnOut, IterIn, UniOp> > __fastcall FORCE_INLINE
1138  transform(IterIn b1, IterIn e1, typename CollnOut::container_type::iterator b2, UniOp const &uniop);
1139 
1140  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
1141  /**
1142  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1143 
1144  Also this algorithm will be faster than a single-threaded algorithm if the cost of copying each element is more expensive than the threading costs in the pool.
1145 
1146  This algorithm makes at most 1 memory allocation.
1147 
1148  \param b1 The input-iterator to the start of the first range.
1149  \param e1 The input-iterator to the end of the first range.
1150  \param b2 The output-iterator to the start of the output range.
1151  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when the copy is complete.
1152 
1153  \see copy(), transform()
1154  \see transform_t
1155  */
1156  template<
1157  class CollnIn,
1158  class CollnOut,
1159  class IterIn
1160  > parallel_algorithm<copy_iter_t<CollnIn, CollnOut, IterIn> > __fastcall FORCE_INLINE
1161  copy(IterIn b1, IterIn e1, typename CollnOut::container_type::iterator b2);
1162 
1163 protected:
1164  cfg_type cfg_; ///< The control-flow graph, if such support is enabled; exposed via the cfg() accessors.
1165 
1166  /// Construct the pool base, recording the limit on the number of threads the pool may use.
1167  explicit thread_pool_base(const pool_size_type max_num_threads) noexcept(false) FORCE_INLINE
1167  : max_num_threads_in_pool(max_num_threads) {
1168  thread_traits::set_backtrace_on_signal(); // NOTE(review): presumably installs a signal handler that emits a backtrace - name-based inference; confirm in thread_traits.
1169  thread_traits::set_backtrace_on_terminate(); // NOTE(review): presumably registers a terminate handler that emits a backtrace - name-based inference; confirm in thread_traits.
1170  }
1171 
1172  /// Obtain access to any statistics data collected by the operation of the thread_pool.
1173  virtual statistics_type &__fastcall set_statistics() noexcept(true)=0;
1174 
1175  virtual signalled_work_queue_type & __fastcall queue() noexcept(true)=0; ///< Mutable access to the queue of work transferred into the pool.
1176  virtual signalled_work_queue_type const & __fastcall queue() const noexcept(true)=0; ///< Read-only access to the queue of work transferred into the pool.
1177 
1178  virtual void __fastcall add_nonjoinable_work(typename signalled_work_queue_type::value_type &&) noexcept(false)=0; ///< Enqueue work for which no handle is returned, so completion cannot be waited upon.
1179  virtual typename signalled_work_queue_type::value_type __fastcall add_joinable_work(typename signalled_work_queue_type::value_type &&) noexcept(false)=0; ///< Enqueue work, returning the queued item so that completion can be waited upon.
1180 
1181  /**
1182  \return True if there is more closure_base-derived closure to process() in the batch, otherwise false.
1183  */
1184  virtual bool __fastcall process_a_batch_item(const typename thread_traits::api_params_type::tid_type, typename os_traits::thread_exception const &) noexcept(false) FORCE_INLINE {
1185  return false; // Default: no batching support, so there is never a further item to process.
1186  }
1187 
1188  /// Wrap the queued work-item, together with this pool, into the execution-context's checked-argument type.
1188  template<class ExecCtx>
1189  typename ExecCtx::chk_argument_type __fastcall FORCE_INLINE
1190  make_arg(typename signalled_work_queue_type::value_type &&async_wk) {
1191  return ExecCtx::template make_arg<typename ExecCtx::result_type>(
1192  std::forward<typename signalled_work_queue_type::value_type>(async_wk), // equivalent to std::move(async_wk): the parameter is a named rvalue-reference, not a forwarding reference
1193  this
1194  );
1195  }
1196 
1197  virtual typename pool_traits_type::template exit_requested_type<typename work_distribution_mode::queue_model> &exit_requested() noexcept(true)=0; ///< NOTE(review): presumably the state signalled to request that the pool's threads exit - confirm in the derived pools.
1198 
1199 private:
1200  template<class TPB> friend class joinable_t; // NOTE(review): these friendships presumably grant the work-transfer adaptors & execution contexts access to the protected queue/add_*_work operations - confirm against their definitions.
1201  template<class TPB> friend class nonjoinable_t;
1202  template<class TPB> friend class nonjoinable_buff_t;
1203  template<template<class> class Joinability, class TPB, typename TPB::priority_type Pri> friend class priority_t;
1204  template<class DM1, generic_traits::return_data RD, class TPB, class Wk> friend class execution_context_stack_type;
1205  template<class DM1, generic_traits::return_data RD, class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk> friend class execution_context_algo_stack_type;
1206  template<generic_traits::return_data RD, class TPB, template<class> class Del, template<class> class AtCtr> friend class horizontal_execution;
1207 };
1208 
1209 } } } }
1210 
1212 
1213 #endif