libjmmcg  release_579_6_g8cffd
A C++ library containing an eclectic mix of useful, advanced components.
thread_pool_base.hpp
Go to the documentation of this file.
1 #ifndef LIBJMMCG_CORE_PRIVATE_THREAD_POOL_BASE_HPP
2 #define LIBJMMCG_CORE_PRIVATE_THREAD_POOL_BASE_HPP
3 
4 /******************************************************************************
5 ** Copyright © 2010 by J.M.McGuiness, coder@hussar.me.uk
6 **
7 ** This library is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU Lesser General Public
9 ** License as published by the Free Software Foundation; either
10 ** version 2.1 of the License, or (at your option) any later version.
11 **
12 ** This library is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ** Lesser General Public License for more details.
16 **
17 ** You should have received a copy of the GNU Lesser General Public
18 ** License along with this library; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 
23 #include "../../core/thread_pool_aspects.hpp"
25 #include "../../core/thread_statistics.hpp"
26 
27 namespace jmmcg { namespace LIBJMMCG_VER_NAMESPACE { namespace ppd { namespace private_ {
28 
/// String literals naming the parallel algorithms; presumably used to label the
/// corresponding nodes in the run-time control-flow graph (cfg_type) — confirm at
/// the use-sites, which are outside this view.
static constexpr char const node_details_acc_op[]="accumulate_op";
static constexpr char const node_details_acc[]="accumulate";
static constexpr char const max_element_str[]="max_element";
static constexpr char const min_element_str[]="min_element";
33 
/// Metafunction yielding the element-type of a possibly shared_ptr-wrapped type.
/**
 Primary template: for a plain type V the result is V itself. The second
 (unnamed) parameter is the lock-type, unused here, but matched by the
 shared_ptr specialisation that follows.

 NOTE(review): the "struct remove_shared_ptr {" declaration line was missing in
 this extraction; it has been reconstructed from the specialisation below —
 confirm against the upstream thread_pool_base.hpp.
*/
template<class V, class>
struct remove_shared_ptr {
 typedef V value_type;
};
38 template<class V, class LkT>
39 struct remove_shared_ptr<shared_ptr<V, LkT>, LkT> {
40  typedef V value_type;
41 };
42 
43 /// Base class for all multi-threaded thread_pools.
44 /**
45  thread_pool_base is for a finite-sized pool of threads (possibly large). You transfer work to the pool, and may then obtain the results of that work via an execution_context class that allows you to get the result (automatically waiting if necessary for the work to be processed). The closure_base-derived closure is processed in the order in which it is transferred to the pool. When the pool goes out of scope, all threads within the pool are given a grace period to end their work, and if necessary those unfinished threads are terminated.
46  Note that the work scheduling technique in these pools is the baker's ticket scheduling algorithm. Because the thread_wk_t in the queue is owned by the queue and the execution_contexts are strictly automatic variables, the current list of work in the queue is guaranteed to be strictly independent of each other. This is an important result: it means that the thread_pool is free to re-schedule the work in any way it chooses, and the program correctness is guaranteed.
47 
48  Notes:
49  ======
50  1. Both thread_pool and thread_wk_t have various traits. These specialise the pool implementation for various architectural features. e.g. Win32 & Posix threads are very slow to create, synchronise & destroy. For other architectures this may not be the case. Appropriate specialisations may be provided.
51  2. Currently the implementation is biased towards non-cellular threading, or "heavyweight" threading. This may not be the case in the future.
52  3. More traits on the pool allow for alternative work-sharing techniques. Currently in the statically sized pool (pool_traits::work_distribution_mode_t::worker_threads_get_work) the workers steal work from the signalled_work_queue. Alternatively for a master-slave approach (pool_traits::work_distribution_mode_t::one_thread_distributes<>) in the thread_wk_t class, the work is assigned to threads via the "main" thread that performs the work assignment. Both of these techniques have their limitations, but this may not be the case in the future.
53  4. std::remove() has not been implemented because it returns an iterator to the partitioned collection. Iterators into collections have difficulties in a multi-threaded environment because the iterator shouldn't be invalidated by subsequent operations on the collection, and the order of these subsequent operations is hard to predict in a multi-threaded environment. In fact PPD eschews iterators and uses the collections directly, which is effectively a "range" instead, for those reasons.
54 
55  Example usage:
56  ==============
57  See the examples directory.
58 */
/// Forward declaration of the primary template; a definition follows below.
template<
 class DM, ///< The work distribution mode, selected from the types in the work_distribution_mode_t namespace.
 pool_traits::size_mode_t Ps, ///< The size model of the pool, selected from size_mode_t.
 typename PTT, ///< The pool_aspects that contains many more properties of the thread_pool.
 class Pt ///< Not for the user. The container for the pool_threads, used internally by the library.
>
class thread_pool_base;
66 
67 template<
68  class DM,
70  typename PTT,
71  class Pt
72 >
73 class thread_pool_base {
74 private:
 /**
 To avoid a race condition in executing the first task by a pool_thread and calling execution_context::get_results().
 */
 static constexpr unsigned short init_num_jobs_par_alg=1;
 // NOTE(review): the "other" count starts at zero — presumably the initial job count
 // outside the parallel-algorithm case; confirm at the use-sites, which are outside this view.
 static constexpr unsigned short init_num_jobs_par_alg_other=0;
 /**
 In parallel algorithms we use the full range of the input collection by default.
 */
 static constexpr unsigned short default_num_subranges=1;
84 
85 public:
86  using pool_traits_type=PTT;
87  using pool_type=Pt;
88  using pool_size_type=typename pool_type::size_type;
89  using pool_thread_type=typename remove_shared_ptr<typename pool_type::value_type, api_lock_traits<platform_api, sequential_mode>>::value_type;
90  /// A useful typedef to easily get to the various OS traits.
91  using os_traits=typename pool_traits_type::os_traits;
92  /// All exceptions thrown by this class are of this type, or derived from it.
93  /**
94  The exception class in this class is called "exception". This is, naturally, eventually derived from "std::exception".
95 
96  \see exception_type
97  */
98  using exception_type=typename os_traits::exception_type;
99  /// A useful typedef to easily get to the various OS specific thread-traits.
100  using thread_traits=typename os_traits::thread_traits;
101  /// A useful typedef to easily get to the various API details.
102  using api_params_type=typename thread_traits::api_params_type;
103  /// A useful typedef to easily get to the various priorities.
104  using priority_type=typename api_params_type::priority_type;
105  using work_distribution_mode=DM;
106  using pool_thread_queue_details=typename pool_traits_type::template pool_thread_queue_details<typename work_distribution_mode::queue_model>;
107  using statistics_type=typename pool_thread_queue_details::statistics_type;
108  using signalled_work_queue_type=typename pool_traits_type::template signalled_work_queue_type<typename work_distribution_mode::queue_model>;
109  using queue_size_type=typename signalled_work_queue_type::size_type;
110 
111  static constexpr pool_traits::size_mode_t size_mode=Ps;
112  /**
113  To assist in allowing compile-time computation of the algorithmic order of the threading model.
114  */
115  static constexpr generic_traits::memory_access_modes memory_access_mode=((pool_traits_type::template thread_pool_queue_details<typename work_distribution_mode::queue_model>::memory_access_mode==generic_traits::memory_access_modes::crew_memory_access && pool_traits_type::template pool_thread_queue_details<typename work_distribution_mode::queue_model>::memory_access_mode==generic_traits::memory_access_modes::crew_memory_access) ? generic_traits::memory_access_modes::crew_memory_access : generic_traits::memory_access_modes::erew_memory_access);
116 
117  /// The type of the control-flow graph that will be generated at run-time, if supported.
118  /**
119  \see cfg(), dummy_control_flow_graph, control_flow_graph
120  */
121  using cfg_type=typename pool_traits_type::cfg_type;
122  /// A useful typedef to easily get to the nonjoinable grammar element.
123  /**
124  \see nonjoinable_t
125  */
126  using nonjoinable=nonjoinable_t<thread_pool_base>;
127  /// A useful typedef to easily get to the joinable grammar element.
128  /**
129  \see joinable_t
130  */
131  using joinable=joinable_t<thread_pool_base>;
132  /// A useful typedef to easily get to the nonjoinable_buff grammar element.
133  /**
134  Used internally by the library.
135 
136  \see nonjoinable_buff_t
137  */
138  using nonjoinable_buff=nonjoinable_buff_t<thread_pool_base>;
 /**
 Used internally by the library.

 Tag-type parameterised on the compile-time priority value.
 NOTE(review): shown empty here, yet it is constructed from *this in the
 priority-transfer member further down — its body may have been elided in this
 extraction; confirm against the upstream source.
 */
 template<priority_type Pri> struct priority {};
143  /// This is a useful typedef to get at the execution_context. The work is allocated on the stack, inside this type.
144  /**
145  Used internally by the library.
146 
147  The execution_context is created by joinably transferring work into the pool. It has various uses, but is primarily used to atomically and synchronously wait on the results of the work on the closure_base-derived closure-derived object, as specified by the thread_wk_t object transferred into the pool. But it can also pass back specified exceptions that may be thrown by the work. It can also be used to asynchronously test if the work has been completed, and delete the work from the pool, if it has not been started.
148 
149  \see create_direct
150  \see execution_context_stack_type
151  \see joinable_t
152  \see closure_base
153  */
154  template<class InpWk>
155  struct execution_context_stack;
156  /// Used by the library to implicitly generate a closure from the InpWk type.
157  /**
158  Used internally by the library.
159  */
160  template<
161  typename InpWk, ///< The closure_base-derived closure type. The result_type is inferred from the process(result_type &) or process() member-functions declared in the Wk type. Note that the process() member-function must not be overloaded, or this will not work, also that it must use the __fastcall calling-convention on those platforms that support it.
162  class PtrFnType=decltype(&std::remove_reference<InpWk>::type::process) ///< The default mutator function is called process(), but you could provide an alternative member-function name if desired, as long as the signature is correct. Note that exception-specifications are not supported at all, including "throw()", but exception-declarations, such as noexcept are supported.
163  >
164  struct create_direct;
165 
166 private:
167  /**
168  Used internally be the library.
169  */
170  typedef stl_functor_result_type<bool> boolean_result_type;
171 
172  /// A modifier to allow joinably transferring the work to the pool.
173  /**
174  Used internally be the library.
175 
176  \see for_each(), for_each_work_type, for_each_reduce, subdivide_n_gen_wk1
177  */
178  template<
179  class Colln, ///< The adapted collection to iterate over.
180  class Fn
181  >
182  class for_each_t;
183 
184  /// A modifier to allow joinably transferring the work to the pool.
185  /**
186  Used internally be the library.
187 
188  \see count_if(), counter, count_if_reduce, subdivide_n_gen_wk1
189  */
190  template<
191  class Colln, ///< The adapted collection to search.
192  typename Pred ///< The predicate to be used to find the value to be counted.
193  >
194  class count_if_t;
195 
196  /// A modifier to allow joinably transferring the work to the pool.
197  /**
198  Used internally be the library.
199 
200  \see count(), counter, count_if_reduce, subdivide_n_gen_wk1
201  */
202  template<
203  class Colln ///< The adapted collection to search.
204  >
205  struct count_t;
206 
207  /// A modifier to allow joinably transferring the work to the pool.
208  /**
209  Used internally be the library.
210 
211  \see find_if(), countor_work_type, find_if_reduce, subdivide_n_gen_wk1
212  */
213  template<
214  class Colln, ///< The adapted collection to search.
215  typename Pred ///< The predicate to be used to find the value to be counted.
216  >
217  class find_if_t;
218 
219  /// A modifier to allow joinably transferring the work to the pool.
220  /**
221  Used internally be the library.
222 
223  \see find(), countor_work_type, find_if_reduce, subdivide_n_gen_wk1
224  */
225  template<
226  class Colln ///< The adapted collection to search.
227  >
228  struct find_t;
229 
230  /// A modifier to allow joinably transferring the work to the pool.
231  /**
232  Used internally be the library.
233 
234  \see transform(), for_each_work_type, subdivide_n_gen_wk2
235  */
236  template<
237  typename CollnIn, ///< The adapted collection to transform.
238  typename CollnOut, ///< The adapted collection to output into.
239  typename UniOp
240  >
241  class transform_t;
242 
243  /// A modifier to allow joinably transferring the work to the pool.
244  /**
245  Used internally be the library.
246 
247  \see transform(), transform_t
248  */
249  template<
250  typename CollnIn, ///< The adapted collection to transform.
251  typename CollnOut, ///< The adapted collection to output into.
252  class IterIn, ///< The iterator type to be used to iterate over the collections.
253  typename UniOp ///< The unary operator to apply to the input & generate the output.
254  >
255  class transform_iter_t;
256 
257  /// A modifier to allow joinably transferring the work to the pool.
258  /**
259  Used internally be the library.
260 
261  \see transform(), for_each_work_type, transform2_reduce, subdivide_n_gen_wk3
262  */
263  template<
264  typename CollnIn1, ///< The adapted collection to transform.
265  typename CollnIn2, ///< The adapted collection to transform.
266  typename CollnOut, ///< The adapted collection to output into.
267  typename BinOp ///< The binary operator to apply to the input & generate the output.
268  >
269  class transform2_t;
270 
271  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
272  /**
273  Used internally be the library.
274  */
275  template<class Res>
276  struct map_red_initialiser;
277  /// A modifier to allow joinably transferring the work to the pool.
278  /**
279  Used internally be the library.
280 
281  \see accumulator_work_type, subdivide_n_gen_wk1
282  */
283  template<
284  class Colln, ///< The adapted collection to search.
285  typename BinOp, ///< The binary functor to be used to accumulate the result.
286  class V, ///< The result-type to be returned by the initialiser, to avoid problems with explicit conversions being required.
287  template<class, class> class Reduce, ///< The reduction operation to perform.
288  class Init ///< The initialiser to be used to initialise the result value.
289  >
290  class map_reduce_t;
291 
292  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
293  /**
294  Used internally be the library.
295 
296  \see max_element_t
297  */
298  template<
299  class Colln ///< The adapted collection to searched.
300  >
301  struct max_element_initialiser;
302  /// We use a functor to initialise the value in map_reduce_t, to ensure that it is run whilst locks are taken out on the container, to avoid race-conditions.
303  /**
304  Used internally be the library.
305 
306  \see min_element_t
307  */
308  template<
309  class Colln ///< The adapted collection to searched.
310  >
311  struct min_element_initialiser;
312 
313  /// A modifier to allow joinably transferring the work to the pool.
314  /**
315  Used internally be the library.
316 
317  \see accumulate(), accumulate_processor, accumulate_reduce
318  */
319  template<
320  class Colln, ///< The adapted collection to searched.
321  class BinOp ///< The binary functor to be used to accumulate the result.
322  >
323  struct accumulate_op_processor;
324 
325  /// A modifier to allow joinably transferring the work to the pool.
326  /**
327  Used internally be the library.
328 
329  \see accumulate(), accumulate_op_processor, accumulate_reduce
330  */
331  template<
332  class Colln, ///< The adapted collection to search.
333  class V ///< The result-type to be returned by the initialiser, to avoid problems with explicit conversions being required.
334  >
335  struct accumulate_processor;
336 
337  /// A modifier to allow joinably transferring the work to the pool.
338  /**
339  Used internally be the library.
340 
341  \see map_reduce_t, max_element_initialiser, alg_wk_wrap::max_element_reduce
342  */
343  template<
344  class Colln, ///< The adapted collection to search.
345  class Comp=std::less<typename Colln::value_type> ///< The comparator to use to compare the items.
346  >
347  struct max_element_t;
348  /// A modifier to allow joinably transferring the work to the pool.
349  /**
350  Used internally be the library.
351 
352  \see map_reduce_t, min_element_initialiser, alg_wk_wrap::min_element_reduce
353  */
354  template<
355  class Colln, ///< The adapted collection to search.
356  class Comp=std::less<typename Colln::value_type> ///< The comparator to use to compare the items.
357  >
358  struct min_element_t;
359 
360  /// A modifier to allow joinably transferring the work to the pool.
361  /**
362  Used internally be the library.
363 
364  \see fill_n(), pass_value, fill_n_reduce, subdivide_n_gen_wk1
365  */
366  template<
367  class Colln ///< The adapted collection to fill.
368  >
369  class fill_n_t;
370 
371  /// A modifier to allow joinably transferring the work to the pool.
372  /**
373  Used internally be the library.
374 
375  \see fill(), pass_value, fill_reduce, subdivide_n_gen_wk1
376  */
377  template<
378  class Colln ///< The adapted collection to fill.
379  >
380  class fill_t;
381 
382  /// A modifier to allow joinably transferring the work to the pool.
383  /**
384  Used internally be the library.
385 
386  \see fill(), pass_value, fill_reduce, subdivide_n_gen_wk1
387  */
388  template<
389  class Colln ///< The collection to reverse.
390  >
391  class reverse_t;
392 
393  /// A modifier to allow joinably transferring the work to the pool.
394  /**
395  Used internally be the library.
396 
397  \see merge(), subdivide_n_gen_wk2
398  */
399  template<
400  typename CollnIn1, ///< The adapted collection to merge.
401  typename CollnIn2, ///< The adapted collection to merge.
402  typename CollnOut, ///< The adapted collection to output into.
403  typename Compare ///< The binary functor to be used as the comparator to order the items in the output.
404  >
405  class merge_t;
406 
407  /// A modifier to allow joinably transferring the work to the pool.
408  /**
409  Used internally be the library.
410 
411  \see sort(), subdivide_n_gen_wk1
412  */
413  template<
414  class Colln, ///< The adapted collection to sort.
415  typename Compare ///< The binary functor to be used as the comparator to order the items in the output.
416  >
417  class sort_t;
418 
419 public:
420  /// A modifier to allow joinably transferring the work to the pool.
421  /**
422  Used internally be the library.
423 
424  \see transform(), transform_t
425  transform_iter_t<CollnIn, CollnOut, IterIn, noop<typename CollnOut::value_type>
426  */
427  template<
428  typename CollnIn, ///< The adapted collection to be copied.
429  typename CollnOut, ///< The adapted collection to output into.
430  class IterIn ///< The iterator type to be used to iterate over the collections.
431  >
432  struct copy_iter_t;
433 
434  /// A modifier to allow joinably transferring the work to the pool.
435  /**
436  Used internally be the library.
437 
438  \see swap_ranges(), for_each_work_type, swap_ranges_reduce, subdivide_n_gen_wk1
439  */
440  template<
441  class Colln, ///< The adapted collection to swap the elements.
442  class Pred ///< The binary functor to be used as the comparator to order the items in the output.
443  >
444  class swap_ranges_t;
445 
446  const pool_size_type max_num_threads_in_pool;
447 
 thread_pool_base(thread_pool_base const &)=delete; // Non-copyable: the pool owns its threads.
 /**
 Note that it is recommended that the thread_pool is created on the stack, within main(). Instantiating a thread_pool outside main(), for example as a singleton, is possible and should work, but orderly destruction of the thread_pool is not guaranteed outside main(), and is likely to cause an abort() or core to be dumped. If a singleton outside main() must be used it is recommended to allocate the thread_pool on the heap, and leak it at exit, hoping the OS will clean up all resources. Currently, the thread_pool does not use any inter-process shared memory or global named objects (but this is not guaranteed in the future), so in principle the OS should be able to recover all that would be leaked.
 */
 // noexcept(false): per the algorithm documentation below, exceptions from transferred
 // work may propagate to the dtor of the thread_pool, so destruction is deliberately
 // permitted to throw.
 virtual __stdcall ~thread_pool_base() noexcept(false) FORCE_INLINE {
 }
454 
455  /// Returns true if there are no threads in the thread_pool.
456  /**
457  \return true if there are no threads in the thread_pool.
458  */
459  virtual bool __fastcall pool_empty() const noexcept(true)=0;
460  /// Returns the current number of threads in the thread_pool.
461  /**
462  \return The current number of threads in the thread_pool.
463  */
464  virtual const pool_size_type __fastcall pool_size() const noexcept(true)=0;
465  /// Returns true if there is no work to process by the thread_pool.
466  /**
467  \return true if there is no work to process by the thread_pool.
468  */
469  virtual bool __fastcall queue_empty() const noexcept(true)=0;
470  /// Returns the current amount of outstanding, unscheduled work items to be processed by the thread_pool.
471  /**
472  \return The current amount of outstanding, unscheduled work items to be processed by the thread_pool.
473  */
474  virtual const queue_size_type __fastcall queue_size() const noexcept(true)=0;
475 
476  /// Remove all of the outstanding tasks from the thread pool.
477  /**
478  Note that this may cause deadlocks if the tasks are inter-dependent.
479  */
480  virtual void __fastcall queue_clear() noexcept(false)=0;
481 
482  /// Obtain access to any statistics data collected by the operation of the thread_pool.
483  virtual statistics_type const __fastcall statistics() const =0;
484 
485  /// Return the theoretical minimum time in computations according to section 3.3 & Theorem 3.3 in [1] required to complete the current work with the current number of threads in the pool using a CREW-PRAM and according to section 1.3.2, Theorem 1.2 in [2] for an EREW-PRAM.
486  /**
487  This allows the user to determine the current computational efficiency of their thread_pool with the supplied thread-safe adapted container, safe_colln, as they can use this to profile their code and adjust the size of the thread_pool for the target architecture.
488 
489  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
490  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
491 
492  \return The minimum number of computations
493 
494  \todo It would be nice if there was some result for returning this with respect to the memory access models of the work within the queue (which may be a mix of CREW & EREW memory models) for the current thread_pool.
495 
496  \see safe_colln
497  */
498  virtual unsigned long __fastcall
499  min_time(generic_traits::memory_access_modes mode) const noexcept(true)=0;
500 
501  /// Return the theoretical minimum number of processors required to achieve the minimum computation time according to section 3.3 & Theorem 3.3 in [1] required to complete the current work using a CREW-PRAM.
502  /**
503  This allows the user to determine the current computational efficiency of their thread_pool with the supplied thread-safe adapted container, safe_colln, as they can use this to profile their code and adjust the size of the thread_pool for the target architecture.
504 
505  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
506 
507  \return The minimum number of processors
508 
509  \todo It would be nice if there was some result for returning this with respect to the memory access models of the work within the queue (which may be a mix of CREW & EREW memory models) for the current thread_pool.
510 
511  \see safe_colln
512  */
513  virtual unsigned long __fastcall
514  min_processors(generic_traits::memory_access_modes mode) const noexcept(true)=0;
515 
 /// Transfer the priority to the pool, non-joinably.
 /**
 Verify that the closure_base-derived closure has not been previously transferred, if it has, throw an exception_type.

 \return A reference to the pool to allow chaining.

 \see priority, set_priority
 */
 template<priority_type Pri>
 priority<Pri> __fastcall FORCE_INLINE
 // NOTE(review): the declarator line (the member's name and parameter list) is missing
 // between the return-type line above and the body below — almost certainly lost in
 // extraction; restore it from the upstream thread_pool_base.hpp.
 return priority<Pri>(*this);
 }
529 
530  /// Non-joinably transfer the closure_base-derived closure into the thread_pool.
531  /**
532  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
533 
534  \see nonjoinable, joinable
535  */
536  nonjoinable __fastcall FORCE_INLINE
537  operator<<(nonjoinable &&nj) noexcept(true) {
538  return nonjoinable(std::forward<nonjoinable>(nj), *this);
539  }
540 
541  /// Non-joinably transfer the closure_base-derived closure into the thread_pool.
542  /**
543  \param njb Initialised with a suitable buffer for allocating the internal work items into.
544 
545  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
546 
547  \see nonjoinable_buff, algo_thread_wk_buffered
548  */
549  nonjoinable_buff __fastcall FORCE_INLINE
550  operator<<(nonjoinable_buff &&njb) noexcept(true) {
551  return nonjoinable_buff(std::forward<nonjoinable_buff>(njb), *this);
552  }
553 
554  /// Joinably transfer the closure_base-derived closure into the thread_pool.
555  /**
556  \todo JMG: Hubert Matthews suggested that potentially expression templates could be used here to concatenate the thread_wk_t's that are transferred into the pool; also as an implementation of back_batching, i.e. GSS(k) scheduling.
557 
558  \see joinable_t, nonjoinable_t
559  */
560  joinable_t<thread_pool_base> __fastcall FORCE_INLINE
561  operator<<(joinable_t<thread_pool_base> &&j) noexcept(true) {
562  return joinable_t<thread_pool_base>(std::forward<joinable_t<thread_pool_base>>(j), *this);
563  }
564 
565  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
566  /**
567  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a copy of the std::unary_function f is applied. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of f, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the
568 execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
569 
570  Also this algorithm will be faster than a single-threaded algorithm if the cost of f(Colln::value_type) is more expensive than the threading costs in the pool.
571 
572  This algorithm makes at most 1 memory allocation.
573 
574  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
575  \param fn A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If fn is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
576  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
577 
578  \see std::for_each()
579  \see subdivide_n_gen_wk1
580  \see for_each_t
581  \see execution_context
582  */
583  template<
584  class Colln,
585  typename Fn
586  > parallel_algorithm<for_each_t<Colln, Fn> > __fastcall FORCE_INLINE
587  for_each(Colln const &c, Fn const &fn);
588 
589  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
590  /**
591  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a p(Colln::value_type) is performed and if the result is true, an atomic_counter_type is incremented. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of p, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is
592 nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
593 
594  Also this algorithm will be faster than a single-threaded algorithm if the cost of p(Colln::value_type) is more expensive than the threading costs in the pool.
595 
596  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
597  \param p The predicate to use to count the matching values.
598  \return An execution_context that may be waited upon to determine when all of the operation is complete, and obtain the count.
599 
600  \see std::count_if()
601  \see subdivide_n_gen_wk1
602  \see count_if_t
603  \see safe_colln
604  \see execution_context
605  */
606  template<
607  class Colln,
608  class Pred
609  > parallel_algorithm<count_if_t<Colln, Pred> > __fastcall FORCE_INLINE
610  count_if(Colln const &c, Pred const &p);
611 
612  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
613  /**
614  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
615  \param v The value to find and be counted.
616  \return An execution_context that may be waited upon, and obtain the count.
617 
618  \see count_if()
619  \see execution_context
620  */
621  template<
622  class Colln
623  > parallel_algorithm<count_t<Colln> > __fastcall FORCE_INLINE
624  count(Colln const &c, typename Colln::value_type const &v);
625 
626  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
627  /**
628  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a p(Colln::value_type) is performed and if the result is true, the element is found. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of p, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the
629 client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
630 
631  Also this algorithm will be faster than a single-threaded algorithm if the cost of p(Colln::value_type) is more expensive than the threading costs in the pool.
632 
633  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. The collection should contain unique elements.
634  \param p The predicate to use to find any equivalent value.
635  \return An execution_context that may be waited upon to determine when all of the operation is complete, and obtain a boolean that indicates if there exists an element in the collection that is the same as the predicate.
636 
637  \see std::find_if()
638  \see subdivide_n_gen_wk1
639  \see find_if_t
640  \see safe_colln
641  \see execution_context
642  */
643  template<
644  class Colln,
645  class Pred
646  > parallel_algorithm<find_if_t<Colln, Pred> > __fastcall FORCE_INLINE
647  find_if(Colln const &c, Pred const &p);
648 
649  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
650  /**
651  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
652  \param v The value to find.
653  \return An execution_context that may be waited upon, and obtain a boolean that indicates if the value was found.
654 
655  \see find_if()
656  \see execution_context
657  */
658  template<
659  class Colln
660  > parallel_algorithm<find_t<Colln> > __fastcall FORCE_INLINE
661  find(Colln const &c, typename Colln::value_type const &v);
662 
663  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
664  /**
665  A read_lock_type is taken in the input collection and a write_lock_type on the output collection. The output collection is resize()d to be big enough to take all elements from the input collection, then the write_lock_type is decayed to a read_lock_type, atomically. Then for each iterator in the range [in.begin(), in.end()) a op(CollnIn::value_type) is assigned to each element in the output collection. The application, input iterator and output iterator tuples are enqueued in the thread_pool for eventual application. The read_lock_types on all the collections are released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an
666 exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
667 
668  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
669 
670  This algorithm makes at most 1 memory allocation.
671 
672  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
673  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, and optionally a decaying_write lock.
674  \param uniop The unary operator to apply to each element of in; the output is placed into out. The collection should support random-access iterators and optionally a decaying_write lock. A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If uniop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
675  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
676 
677  \see std::transform()
678  \see subdivide_n_gen_wk1
679  \see transform_t
680  \see safe_colln
681  \see execution_context
682  */
683  template<
684  typename CollnIn,
685  typename CollnOut,
686  class UniOp
687  > parallel_algorithm<transform_t<CollnIn, CollnOut, UniOp> > __fastcall FORCE_INLINE
688  transform(CollnIn const &in, CollnOut &out, UniOp const &uniop);
689 
690  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
691  /**
692  A read_lock_type is taken in the input collections and a write_lock_type on the output collection. The output collection is resize()d to be big enough to take all elements from the input collection, then the write_lock_type is decay()ed to a read_lock_type, atomically. Then for each iterator in the range [in1.begin(), in1.end()) a op(CollnIn1::value_type, CollnIn2::value_type) is assigned to each element in the output collection. The application, input iterators and output iterator tuples are enqueued in the thread_pool for eventual application. The read_lock_types on all the collections are released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an
693 application causes an exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
694 
695  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn1::value_type,CollnIn2::value_type)) is more expensive than the threading costs in the pool.
696 
697  This algorithm makes at most 1 memory allocation.
698 
699  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
700  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
701  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, and optionally a decaying_write lock.
702  \param binop The binary operator to apply to each pair of elements of in1 and in2; the output is placed into out. A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If binop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
703  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
704 
705  \see std::transform()
706  \see subdivide_n_gen_wk1
707  \see transform2_t
708  \see safe_colln
709  \see execution_context
710  */
711  template<
712  typename CollnIn1,
713  typename CollnIn2,
714  typename CollnOut,
715  class BinOp
716  > parallel_algorithm<transform2_t<CollnIn1, CollnIn2, CollnOut, BinOp> > __fastcall FORCE_INLINE
717  transform(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out, BinOp const &binop);
718 
719  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
720  /**
721  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
722 
723  This algorithm makes at most 1 memory allocation.
724 
725  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
726  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and optionally a decaying_write lock.
727  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
728 
729  \see std::copy()
730  \see transform()
731  */
732  template<
733  typename CollnIn,
734  typename CollnOut
735  > parallel_algorithm<transform_t<CollnIn, CollnOut, noop<typename CollnOut::value_type> > > __fastcall FORCE_INLINE
736  copy(CollnIn const &in, CollnOut &out);
737 
738  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
739  /**
740  A read_lock_type is taken, and for each iterator in the range [c.begin(), c.end()) a op(Colln::value_type, v) is performed and if the result is true, an atomic_counter_type is incremented. The application and iterator pair are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all the applications have completed. It is unspecified if the applications will complete before the function returns. For a joinable thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of op, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is
741 nonjoinable_t, or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
742 
743  Also this algorithm will be faster than a single-threaded algorithm if the cost of op(Colln::value_type, v) is more expensive than the threading costs in the pool.
744 
745  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
746  \param v The initial value, note that this must be an identity for the binary operation binop, otherwise the result will be an undefined value.
747  \param binop The binary operation to use.
748  \return An execution_context that may be waited upon to determine when all of the applications of op are complete, and obtain the result.
749 
750  \see std::accumulate()
751  */
752  template<
753  class Colln,
754  typename BinOp
755  > parallel_algorithm<accumulate_op_processor<Colln, BinOp> > __fastcall FORCE_INLINE
756  accumulate(Colln const &c, typename BinOp::result_type const &v, BinOp const &binop);
757 
758  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
759  /**
760  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
761  \param v The initial value, note that this must be an identity for the binary operator+(), otherwise the result will be an undefined value.
762  \return An execution_context that may be waited upon to determine when all of the applications of op are complete, and obtain the result.
763 
764  \see accumulate()
765  */
766  template<
767  class Colln,
768  class V
769  > parallel_algorithm<accumulate_processor<Colln, V>> __fastcall FORCE_INLINE
770  accumulate(Colln const &c, V const &v);
771 
772  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
773  /**
774  A write_lock_type is taken, the collection c is resized, the write_lock_type is decayed() to a read_lock_type, then for each iterator, iter, in the range [c.begin(), c.end()) the value v is assigned in some unspecified order. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the assignments will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor
775 of the execution_context. If the thread_pool is nonjoinable(), or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
776 
777  Also this algorithm will be faster than a single-threaded algorithm if the cost of *iter=Colln::value_type(v) is more expensive than the threading costs in the pool.
778 
779  This algorithm makes at most 1 memory allocation.
780 
781  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators.
782  \param sz The number of items to place into the collection.
783  \param v The value used to copy into the collection.
784  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
785 
786  \see std::fill_n()
787  \see subdivide_n_gen_wk1
788  \see fill_n_t
789  \see execution_context
790  */
791  template<
792  class Colln
793  > parallel_algorithm<fill_n_t<Colln> > __fastcall FORCE_INLINE
794  fill_n(Colln &c, typename Colln::size_type sz, typename Colln::value_type const &v);
795 
796  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
797  /**
798  A read_lock_type is taken then for each iterator, iter, in the range [c.begin(), c.end()) the value v is assigned in some unspecified order. The application and iterator pairs are enqueued in the thread_pool for eventual application. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the assignments will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all applications are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable(), or the client omits
799 to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
800 
801  Also this algorithm will be faster than a single-threaded algorithm if the cost of *iter=Colln::value_type(v) is more expensive than the threading costs in the pool.
802 
803  This algorithm makes at most 1 memory allocation.
804 
805  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators.
806  \param v The value used to copy into the collection.
807  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
808 
809  \see std::fill()
810  \see subdivide_n_gen_wk1
811  \see fill_t
812  \see execution_context
813  */
814  template<
815  class Colln
816  > parallel_algorithm<fill_t<Colln> > __fastcall FORCE_INLINE
817  fill(Colln &c, typename Colln::value_type const &v);
818 
819  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
820  /**
821  A read_lock_type is taken then the range is reversed, the iterator pairs that are swapped are done in some unspecified order. The read_lock_type on the collection is released when all of the assignments have completed. It is unspecified if the swaps will complete before the function returns. For a joinable_t thread_pool, and if an execution_context is constructed from the return-type, and the client requests a synchronisation on the execution_context, then all swaps are guaranteed to have completed, in some unspecified order and manner (e.g. due to exceptions). If an application causes an exception to be thrown that progresses beyond the stack frame of the assignment, for any case, then the last such exception thrown will be re-thrown either when the client requests that synchronisation, or in the dtor of the execution_context. If the thread_pool is nonjoinable(), or the client omits to construct the execution_context, then that exception is propagated to the dtor of the thread_pool, and it is
822 unspecified when those applications will complete, but they will complete sometime before the dtor of the thread_pool exits.
823 
824  Also this algorithm will be faster than a single-threaded algorithm if the cost of the iter_swaps are more expensive than the threading costs in the pool.
825 
826  This algorithm makes at most 1 memory allocation.
827 
828  \param c An adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support bidirectional-iterators.
829  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
830 
831  \see std::reverse()
832  \see subdivide_n_gen_wk1
833  \see reverse_t
834  \see execution_context
835  */
836  template<
837  typename Colln
838  > parallel_algorithm<reverse_t<Colln> > __fastcall FORCE_INLINE
839  reverse(Colln &c);
840 
841  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
842  /**
843  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
844  \param comp The comparator to use to compare the items.
845  \return An execution_context that may be waited upon to obtain the result, which is the largest value in the collection, not an iterator to it.
846 
847  \see std::max_element()
848  \see max_element_t
849  \see execution_context
850  */
851  template<
852  class Colln,
853  class Comp
854  > parallel_algorithm<max_element_t<Colln, Comp>> __fastcall FORCE_INLINE
855  max_element(Colln const &c, Comp const &comp);
856 
857  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
858  /**
859  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
860  \return An execution_context that may be waited upon to obtain the result, which is the largest value in the collection, not an iterator to it.
861 
862  \see std::max_element()
863  \see max_element_t
864  \see execution_context
865  */
866  template<
867  typename Colln
868  > parallel_algorithm<max_element_t<Colln, std::less<typename Colln::value_type>>> __fastcall FORCE_INLINE
869  max_element(Colln const &c);
870 
871  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
872  /**
873  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
874  \param comp The comparator to use to compare the items.
875  \return An execution_context that may be waited upon to obtain the result, which is the smallest value in the collection, not an iterator to it.
876 
877  \see std::min_element()
878  \see min_element_t
879  \see execution_context
880  */
881  template<
882  class Colln,
883  class Comp
884  > parallel_algorithm<min_element_t<Colln, Comp>> __fastcall FORCE_INLINE
885  min_element(Colln const &c, Comp const &comp);
886 
887  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
888  /**
889  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators.
890  \return An execution_context that may be waited upon to obtain the result, which is the smallest value in the collection, not an iterator to it.
891 
892  \see std::min_element()
893  \see min_element_t
894  \see execution_context
895  */
896  template<
897  typename Colln
898  > parallel_algorithm<min_element_t<Colln, std::less<typename Colln::value_type>>> __fastcall FORCE_INLINE
899  min_element(Colln const &c);
900 
901  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
902  /**
903  Complexity: O(log^2(n.log(n)/p+log(p))) on average, if std::stable_sort() is used as the underlying sort algorithm, with enough memory.
904 
905  This implements Batcher's Bitonic Merge rather than Cole's parallel merge in section 5.1 pg 184 in [1] or in section 2.2.2 in [2] because of the results from [3].
906 
907  Take out a temporary read lock on the input collection and a write lock on the output collection to resize the output collection, so that it has the right number of iterators. Then take out read-locks on both, so that the write lock on the output collection is dropped, as we are now no longer modifying the actual collection, but the contents of each iterator.
908 
909  This algorithm makes at most O(min(log(p), log(n))) memory allocations.
910 
911  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
912  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
913  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and must have a recursive lock specified for it, otherwise an exception will be thrown, optionally a decaying_write lock.
914  \param comp A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If comp is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
915  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
916 
917  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
918  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
919  [3] Natvig, L., "Logarithmic time cost optimal parallel sorting is not yet fast in practice!", ACM/IEEE, 1990.
920 
921  \see std::merge, sort(), merge_t
922  */
923  template<
924  typename CollnIn1,
925  typename CollnIn2,
926  typename CollnOut,
927  class Compare
928  > parallel_algorithm<merge_t<CollnIn1, CollnIn2, CollnOut, Compare> > __fastcall FORCE_INLINE
929  merge(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out, Compare const &comp);
930 
931  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
932  /**
933  \param in1 The first input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
934  \param in2 The second input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
935  \param out The output adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward output-iterators and optionally a decaying_write lock.
936  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
937 
938  \see merge()
939  */
940  template<
941  typename CollnIn1,
942  typename CollnIn2,
943  typename CollnOut
944  > parallel_algorithm<merge_t<CollnIn1, CollnIn2, CollnOut, std::less<typename CollnOut::value_type> > > __fastcall FORCE_INLINE
945  merge(CollnIn1 const &in1, CollnIn2 const &in2, CollnOut &out);
946 
947  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
948  /**
949  Complexity: O(log^2(n.log(n)/p+log(p))) on average, if std::stable_sort() is used as the underlying sort algorithm, with enough memory.
950 
951  This implements Batcher's Bitonic Merge rather than Cole's parallel merge in section 5.1 pg 184 in [1] or in section 2.2.2 in [2] because of the results from [3].
952 
953  Take out a temporary read lock on the input collection and a write lock on the output collection to resize the output collection, so that it has the right number of iterators. Then take out read-locks on both, so that the write lock on the output collection is dropped, as we are now no longer modifying the actual collection, but the contents of each iterator.
954 
955  This algorithm makes at most O(min(log(p), log(n))^2) memory allocations.
956 
957  \param c An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
958  \param comp A comparison function. A std::binary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If comp is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
959  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
960 
961  [1] Alan Gibbons, Wojciech Rytter, "Efficient Parallel Algorithms", Cambridge University Press, 1989.
962  [2] Casanova, H., Legrand, A., Robert, Y., "Parallel Algorithms", CRC Press, 2008.
963  [3] Natvig, L., "Logarithmic time cost optimal parallel sorting is not yet fast in practice!", ACM/IEEE, 1990.
964 
965  \see std::sort()
966  \see merge()
967  \see sort_t
968  */
969  template<
970  class Colln,
971  typename Compare
972  > parallel_algorithm<sort_t<Colln, Compare> > __fastcall FORCE_INLINE
973  sort(Colln &c, Compare const &comp);
974 
975  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
976  /**
977  \param in An input adapted collection, where the adaptor assists in providing thread safety, e.g. safe_colln. The collection should support forward input-iterators. Note that this should be a power of 2 in size.
978  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of f are complete.
979 
980  \see sort()
981  */
982  template<
983  typename Colln
984  > parallel_algorithm<sort_t<Colln, std::less<typename Colln::value_type> > > __fastcall FORCE_INLINE
985  sort(Colln &in);
986 
987  /// An adaptor to allow STL unary functions to be operated upon in the thread_pool.
988  /**
989  Note that the input is evaluated by transferring it into the pool, and the execution_context that holds the result has an automatic conversion to the result_type. Also note that using thread_wk_ts directly would be faster than using this, but this allows for pipelining of evaluation of the input argument then evaluation of the UniFn.
990 
991  \todo JMG: If there are no side-effects, one might implement compile-time reduction of boolean expressions implemented using logical_and() & logical_or(), see Kennedy & Allen, "Advanced Optimizing Compilers".
992 
993  \param a The parameter, that should have a mutator, process(), that has a result_type of boolean.
994  \param op The unary operation to apply to the input parameter; this also allows for the potential of nesting unary_funs to form a control-flow tree.
995  \return An execution_context that may be waited upon to determine when the mutation is complete, and the result is available.
996 
997  \see thread_wk_t
998  \see closure_base
999  \see std::unary_function
1000  */
1001  template<
1002  class ArgT,
1003  class UniFn
1004  >
1005  execution_context_stack<unary_fun_work_type<ArgT, UniFn, thread_pool_base>> __fastcall FORCE_INLINE
1006  unary_fun(ArgT &&a, UniFn const &op=UniFn());
1007 
1008  /// An adaptor to allow STL binary functions to be operated upon in the thread_pool. Note that the inputs are evaluated by transferring them into the pool, and the execution_context that holds the result has an automatic conversion to the result_type.
1009  /**
1010  Please note that the order of completion of the arguments is undefined. Technically, the left-hand argument is enqueued then the right. The execution_context that holds the result is only released when both arguments have been evaluated. This guarantees that any exceptions thrown in the course of evaluating the arguments will be correctly rethrown by the execution_context. Also note that this functionality allows for pipelining of evaluation of the input arguments then evaluation of the BinFn.
1011 
1012  \todo JMG: If there are no side-effects, one might implement compile-time reduction of boolean expressions implemented using logical_and() & logical_or(), see Kennedy & Allen, "Advanced Optimizing Compilers".
1013 
1014  \param lhs A restricted thread_wk_t, one in which the external state is only read-only, that should have a mutator, process(), that has a result_type of boolean.
1015  \param rhs A restricted thread_wk_t, one in which the external state is only read-only, that should have a mutator, process(), that has a result_type of boolean.
1016  \param op The binary operation to apply to the input parameters, this also allows for potential of nesting binary_funs to form a control-flow tree.
1017  \return An execution_context that may be waited upon to determine when both mutations are complete, and the result is available.
1018 
1019  \see thread_wk_t
1020  \see closure_base
1021  \see std::binary_function
1022  */
1023  template<
1024  class LHSArg,
1025  class RHSArg,
1026  class BinFn
1027  >
1028  execution_context_stack<binary_fun_work_type<LHSArg, RHSArg, BinFn, thread_pool_base>> __fastcall FORCE_INLINE
1029  binary_fun(LHSArg &&lhs, RHSArg &&rhs, BinFn const &op=BinFn());
1030 
1031  /// A parallel implementation of the STL logical operation of the same name.
1032  /**
1033  Note that unlike the standard, this implementation does not short-circuit the operation, therefore side-effects in the second argument may occur in situations in which they would not have using the standard implementation. The order of evaluation of the arguments is undefined.
1034 
1035  \param lhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1036  \param rhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1037  \return An execution_context that may be waited upon to determine when both mutations are complete, and the result is available. As the inputs are pure, they have no side-effects, so the only way to detect any effect is via the execution_context, so this is by default joinable_t.
1038 
1039  \see binary_fun()
1040  \see thread_wk_t
1041  \see closure_base
1042  \see std::logical_and()
1043  */
1044  template<
1045  class T
1046  >
1047  execution_context_stack<binary_fun_work_type<T const, T const, std::logical_and<bool>, thread_pool_base>> __fastcall FORCE_INLINE
1048  logical_and(T &&lhs, T &&rhs);
1049 
1050  /// A parallel implementation of the STL logical operation of the same name.
1051  /**
1052  Note that unlike the standard, this implementation does not short-circuit the operation, therefore side-effects in the second argument may occur in situations in which they would not have using the standard implementation. The order of evaluation of the arguments is undefined.
1053 
1054  \param lhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1055  \param rhs A functor that should be a restricted thread_wk_t, one in which the external state is only read-only, note that the process(bool &) function must be a const member-function, to assist in enforcing this requirement.
1056  \return An execution_context that may be waited upon to determine when the two mutations are complete, and the result is available. As the inputs are pure, they have no side-effects, so the only way to detect any effect is via the execution_context, so this is by default joinable_t.
1057 
1058  \see binary_fun()
1059  \see thread_wk_t
1060  \see closure_base
1061  \see std::logical_or()
1062  */
1063  template<
1064  class T
1065  >
1066  execution_context_stack<binary_fun_work_type<T const, T const, std::logical_or<bool>, thread_pool_base>> __fastcall FORCE_INLINE
1067  logical_or(T &&lhs, T &&rhs);
1068 
1069  /**
1070  Insert a human-readable representation of the state of the given thread_pool into the specified stream. \todo Implement using the advice given in "Standard C++ IOStreams and Locales" by A.Langer & K.Kreft, page 170.
1071  */
1072  template<
1073  class DM1,
1074  pool_traits::size_mode_t Ps1,
1075  typename PTT1,
1076  class Pt1
1077  >
1078  friend tostream &__fastcall
1079  operator<<(tostream &os, thread_pool_base<DM1, Ps1, PTT1, Pt1> const &t);
1080 
1081  /// Mutable access to the control-flow graph, if such support is enabled.
1082  cfg_type & cfg() noexcept(true) FORCE_INLINE {
1083  return cfg_;
1084  }
1085  /// Read-only access to the control-flow graph, if such support is enabled.
1086  cfg_type const & cfg() const noexcept(true) FORCE_INLINE {
1087  return cfg_;
1088  }
1089 
1090  /// An adaptor for the parallel equivalent of the STL algorithm of the same name. Calls std::iter_swap(i, j) for the (potentially overlapping) ranges i in [b1, e1) & j in [b2, b2+(e1-b1)) for each iterator pair (i, j) where p(*i, *j)==true.
1091  /**
1092  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1093 
1094  Also this algorithm will be faster than a single-threaded algorithm if the cost of evaluating "if (p(iterator::value_type const &, iterator::value_type const &)) std::iter_swap()" is more expensive than the threading costs in the pool.
1095 
1096  This algorithm makes at most 1 memory allocation.
1097 
1098  \param b1 The input-iterator to the start of the first range.
1099  \param e1 The input-iterator to the end of the first range.
1100  \param b2 The input-iterator to the start of the second range.
1101  \param p A binary-function predicate that returns bool and takes two arguments of type iterator::value_type. A std::binary_function predicate-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::swap_ranges(). If p is re-entrant and is thread-safe, then, given the particular nature of that predicate, the library may be able to maintain its thread-safety guarantees.
1102  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of p are complete.
1103 
1104  \see std::swap_ranges()
1105  \see subdivide_n_gen_wk1
1106  \see swap_ranges_t
1107  \see execution_context
1108  */
1109  template<
1110  class Colln,
1111  typename Pred
1112  > parallel_algorithm<swap_ranges_t<Colln, Pred> > __fastcall FORCE_INLINE
1113  swap_ranges(typename Colln::container_type::iterator b1, typename Colln::container_type::iterator e1, typename Colln::container_type::iterator b2, Pred const &p);
1114 
1115  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
1116  /**
1117  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1118 
1119  Also this algorithm will be faster than a single-threaded algorithm if the cost of out.insert(op(CollnIn::value_type)) is more expensive than the threading costs in the pool.
1120 
1121  This algorithm makes at most 1 memory allocation.
1122 
1123  \param b1 The input-iterator to the start of the first range.
1124  \param e1 The input-iterator to the end of the first range.
1125  \param b2 The output-iterator to the start of the output range.
1126  \param uniop A std::unary_function-type that need not be thread-safe, nor support re-entrancy, but in other respects the same as for std::for_each(). If uniop is re-entrant and is thread-safe, then, given the particular nature of that functor, the library may be able to maintain its thread-safety guarantees.
1127  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when all of the applications of uniop are complete.
1128 
1129  \see copy(), transform()
1130  \see transform_t
1131  */
1132  template<
1133  typename CollnIn,
1134  typename CollnOut,
1135  class IterIn,
1136  class UniOp
1137  > parallel_algorithm<transform_iter_t<CollnIn, CollnOut, IterIn, UniOp> > __fastcall FORCE_INLINE
1138  transform(IterIn b1, IterIn e1, typename CollnOut::container_type::iterator b2, UniOp const &uniop);
1139 
1140  /// An adaptor for the parallel equivalent of the STL algorithm of the same name.
1141  /**
1142  Note that this algorithm is an advanced algorithm: no locking of the underlying collection(s) to which the iterators access are taken, so if the iterator(s) is(are) invalidated or if the output range is not large enough, then UB will occur.
1143 
1144  Also this algorithm will be faster than a single-threaded algorithm if the cost of copying each element is more expensive than the threading costs in the pool.
1145 
1146  This algorithm makes at most 1 memory allocation.
1147 
1148  \param b1 The input-iterator to the start of the first range.
1149  \param e1 The input-iterator to the end of the first range.
1150  \param b2 The output-iterator to the start of the output range.
1151  \return An opaque type, derived from an execution_context, returned from operator<<(), that must be captured as "auto const &" or "auto &&", that may be waited upon to determine when the copy is complete.
1152 
1153  \see copy(), transform()
1154  \see transform_t
1155  */
1156  template<
1157  class CollnIn,
1158  class CollnOut,
1159  class IterIn
1160  > parallel_algorithm<copy_iter_t<CollnIn, CollnOut, IterIn> > __fastcall FORCE_INLINE
1161  copy(IterIn b1, IterIn e1, typename CollnOut::container_type::iterator b2);
1162 
1163 protected:
1164  cfg_type cfg_; ///< The control-flow graph, if such support is enabled; exposed via the cfg() accessors.
1165 
1166  /// Construct the pool base, recording the limit on the number of threads the pool may use.
1167  explicit thread_pool_base(const pool_size_type max_num_threads) noexcept(false) FORCE_INLINE
1167  : max_num_threads_in_pool(max_num_threads) {
1168  thread_traits::set_backtrace_on_signal(); // NOTE(review): presumably installs a signal handler that emits a backtrace - name-based inference; confirm in thread_traits.
1169  thread_traits::set_backtrace_on_terminate(); // NOTE(review): presumably registers a terminate handler that emits a backtrace - name-based inference; confirm in thread_traits.
1170  }
1171 
1172  /// Obtain access to any statistics data collected by the operation of the thread_pool.
1173  virtual statistics_type &__fastcall set_statistics() noexcept(true)=0;
1174 
1175  virtual signalled_work_queue_type & __fastcall queue() noexcept(true)=0; ///< Mutable access to the queue of work transferred into the pool.
1176  virtual signalled_work_queue_type const & __fastcall queue() const noexcept(true)=0; ///< Read-only access to the queue of work transferred into the pool.
1177 
1178  virtual void __fastcall add_nonjoinable_work(typename signalled_work_queue_type::value_type &&) noexcept(false)=0; ///< Enqueue work for which no handle is returned, so completion cannot be waited upon.
1179  virtual typename signalled_work_queue_type::value_type __fastcall add_joinable_work(typename signalled_work_queue_type::value_type &&) noexcept(false)=0; ///< Enqueue work, returning the queued item so that completion can be waited upon.
1180 
1181  /**
1182  \return True if there is more closure_base-derived closure to process() in the batch, otherwise false.
1183  */
1184  virtual bool __fastcall process_a_batch_item(const typename thread_traits::api_params_type::tid_type, typename os_traits::thread_exception const &) noexcept(false) FORCE_INLINE {
1185  return false; // Default: no batching support, so there is never a further item to process.
1186  }
1187 
1188  /// Wrap the queued work-item, together with this pool, into the execution-context's checked-argument type.
1188  template<class ExecCtx>
1189  typename ExecCtx::chk_argument_type __fastcall FORCE_INLINE
1190  make_arg(typename signalled_work_queue_type::value_type &&async_wk) {
1191  return ExecCtx::template make_arg<typename ExecCtx::result_type>(
1192  std::forward<typename signalled_work_queue_type::value_type>(async_wk), // equivalent to std::move(async_wk): the parameter is a named rvalue-reference, not a forwarding reference
1193  this
1194  );
1195  }
1196 
1197  virtual typename pool_traits_type::template exit_requested_type<typename work_distribution_mode::queue_model> &exit_requested() noexcept(true)=0; ///< NOTE(review): presumably the state signalled to request that the pool's threads exit - confirm in the derived pools.
1198 
1199 private:
1200  template<class TPB> friend class joinable_t; // NOTE(review): these friendships presumably grant the work-transfer adaptors & execution contexts access to the protected queue/add_*_work operations - confirm against their definitions.
1201  template<class TPB> friend class nonjoinable_t;
1202  template<class TPB> friend class nonjoinable_buff_t;
1203  template<template<class> class Joinability, class TPB, typename TPB::priority_type Pri> friend class priority_t;
1204  template<class DM1, generic_traits::return_data RD, class TPB, class Wk> friend class execution_context_stack_type;
1205  template<class DM1, generic_traits::return_data RD, class TPB, template<class, class, template<class> class, template<class> class> class CoreWk, class AlgoWrapT, class Wk> friend class execution_context_algo_stack_type;
1206  template<generic_traits::return_data RD, class TPB, template<class> class Del, template<class> class AtCtr> friend class horizontal_execution;
1207 };
1208 
1209 } } } }
1210 
1212 
1213 #endif